Example #1
 def testInflector(self):
     forms = rewrite.rewrites("grádus[case=gen][num=pl]",
                              self.paradigm_a.inflector)
     self.assertSameElements(["grádusov"], forms)
     forms = rewrite.rewrites("stól[case=gen][num=pl]",
                              self.paradigm_b.inflector)
     self.assertSameElements(["stolóv"], forms)
Example #2
 def testInflector(self):
     forms = rewrite.rewrites("aqua[case=gen][num=pl]",
                              self.paradigm.inflector)
     self.assertSameElements(["aquārum"], forms)
     forms = rewrite.rewrites("puella[case=dat][num=pl]",
                              self.paradigm.inflector)
     self.assertSameElements(["puellīs"], forms)
Example #3
 def testTagger(self):
     tagger = self.paradigm.tagger @ self.paradigm.feature_label_rewriter
     forms = rewrite.rewrites("aquārum", tagger)
     self.assertSameElements(["aquārum[case=gen][num=pl]"], forms)
     forms = rewrite.rewrites("puellīs", tagger)
     self.assertSameElements(
         ["puellīs[case=dat][num=pl]", "puellīs[case=abl][num=pl]"], forms)
Example #4
 def testAnalyzer(self):
     analyzer = self.paradigm_a.analyzer @ self.paradigm_a.feature_label_rewriter
     forms = rewrite.rewrites("grádusov", analyzer)
     self.assertSameElements(["grádus+ov[case=gen][num=pl]"], forms)
     analyzer = self.paradigm_b.analyzer @ self.paradigm_b.feature_label_rewriter
     forms = rewrite.rewrites("stolóv", analyzer)
     self.assertSameElements(["stol+óv[case=gen][num=pl]"], forms)
Example #5
 def testLemmatizer(self):
     lemmatizer = self.paradigm.lemmatizer @ self.paradigm.feature_label_rewriter
     forms = rewrite.rewrites("pacem", lemmatizer)
     self.assertSameElements(["pax[case=acc][num=sg]"], forms)
     forms = rewrite.rewrites("noctibus", lemmatizer)
     self.assertSameElements(
         ["nox[case=dat][num=pl]", "nox[case=abl][num=pl]"], forms)
Example #6
 def testInflector(self):
     forms = rewrite.rewrites("pax[case=acc][num=sg]",
                              self.paradigm.inflector)
     self.assertSameElements(["pacem"], forms)
     forms = rewrite.rewrites("nox[case=dat][num=pl]",
                              self.paradigm.inflector)
     self.assertSameElements(["noctibus"], forms)
Example #7
 def testLemmatizer(self):
     lemmatizer = self.paradigm.lemmatizer @ self.paradigm.feature_label_rewriter
     forms = rewrite.rewrites("aquārum", lemmatizer)
     self.assertSameElements(["aqua[case=gen][num=pl]"], forms)
     forms = rewrite.rewrites("puellīs", lemmatizer)
     self.assertSameElements(
         ["puella[case=dat][num=pl]", "puella[case=abl][num=pl]"], forms)
Example #8
 def testGenerator(self):
     generator = (self.paradigm.stems_to_forms
                  @ self.paradigm.feature_label_rewriter)
     forms = rewrite.rewrites("noct", generator)
     self.assertSameElements(
         [
             "nox+[case=nom][num=sg]",
             "noct+is[case=gen][num=sg]",
             "noct+ī[case=dat][num=sg]",
             "noct+em[case=acc][num=sg]",
             "noct+e[case=abl][num=sg]",
             "noct+ēs[case=nom][num=pl]",
             "noct+um[case=gen][num=pl]",  # TODO(rws): Actually "noctium".
             "noct+ibus[case=dat][num=pl]",
             "noct+ēs[case=acc][num=pl]",  # Also -īs for /i/ stems.
             "noct+ibus[case=abl][num=pl]"
         ],
         forms)
     forms = rewrite.rewrites("rēg", generator)
     self.assertSameElements([
         "rēx+[case=nom][num=sg]", "rēg+is[case=gen][num=sg]",
         "rēg+ī[case=dat][num=sg]", "rēg+em[case=acc][num=sg]",
         "rēg+e[case=abl][num=sg]", "rēg+ēs[case=nom][num=pl]",
         "rēg+um[case=gen][num=pl]", "rēg+ibus[case=dat][num=pl]",
         "rēg+ēs[case=acc][num=pl]", "rēg+ibus[case=abl][num=pl]"
     ], forms)
     forms = rewrite.rewrites("ōs", generator)
     self.assertSameElements([
         "ōs+[case=nom][num=sg]", "ōr+is[case=gen][num=sg]",
         "ōr+ī[case=dat][num=sg]", "ōr+em[case=acc][num=sg]",
         "ōr+e[case=abl][num=sg]", "ōr+ēs[case=nom][num=pl]",
         "ōr+um[case=gen][num=pl]", "ōr+ibus[case=dat][num=pl]",
         "ōr+ēs[case=acc][num=pl]", "ōr+ibus[case=abl][num=pl]"
     ], forms)
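The Paradigm fixtures these Latin tests rely on are built with pynini.lib.paradigms. As a rough sketch of how such a fixture might be assembled (slot inventory abbreviated; treat the exact values as assumptions, not the fixture actually used by these tests):

 from pynini.lib import features, paradigms

 case = features.Feature("case", "nom", "gen", "dat", "acc", "abl")
 num = features.Feature("num", "sg", "pl")
 noun = features.Category(case, num)
 stem = paradigms.make_byte_star_except_boundary()
 slots = [
     (paradigms.suffix("+a", stem), features.FeatureVector(noun, "case=nom", "num=sg")),
     (paradigms.suffix("+ae", stem), features.FeatureVector(noun, "case=gen", "num=sg")),
     # ... remaining first-declension slots ...
     (paradigms.suffix("+ārum", stem), features.FeatureVector(noun, "case=gen", "num=pl")),
 ]
 paradigm = paradigms.Paradigm(
     category=noun,
     name="first declension",
     slots=slots,
     lemma_feature_vector=features.FeatureVector(noun, "case=nom", "num=sg"),
     stems=["aqu"])

From such an object, paradigm.stems_to_forms, paradigm.inflector, paradigm.tagger, paradigm.lemmatizer, and paradigm.analyzer are the FSTs exercised in these tests.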
Example #9
 def testTagger(self):
     tagger = self.paradigm_a.tagger @ self.paradigm_a.feature_label_rewriter
     forms = rewrite.rewrites("grádusov", tagger)
     self.assertSameElements(["grádusov[case=gen][num=pl]"], forms)
     tagger = self.paradigm_b.tagger @ self.paradigm_b.feature_label_rewriter
     forms = rewrite.rewrites("stolóv", tagger)
     self.assertSameElements(["stolóv[case=gen][num=pl]"], forms)
Example #10
 def testLemmatizer(self):
     lemmatizer = (self.paradigm_a.lemmatizer
                   @ self.paradigm_a.feature_label_rewriter)
     forms = rewrite.rewrites("grádusov", lemmatizer)
     self.assertSameElements(["grádus[case=gen][num=pl]"], forms)
     lemmatizer = (self.paradigm_b.lemmatizer
                   @ self.paradigm_b.feature_label_rewriter)
     forms = rewrite.rewrites("stolóv", lemmatizer)
     self.assertSameElements(["stól[case=gen][num=pl]"], forms)
Example #11
 def testSetStemToForms(self):
     stems_and_forms = [
         ("caw", [
             "caw+al[aspect=dubitative]", "caw+inay[aspect=gerundial]",
             "cawaa+ʔaa[aspect=durative]", "caw[aspect=root]"
         ]),
         ("cuum", [
             "cuum+al[aspect=dubitative]", "cum+inay[aspect=gerundial]",
             "cumuu+ʔaa[aspect=durative]", "cuum[aspect=root]"
         ]),
         ("diiyl", [
             "diiyl+al[aspect=dubitative]", "diyl+inay[aspect=gerundial]",
             "diyiil+ʔaa[aspect=durative]", "diiyl[aspect=root]"
         ]),
         ("hiwiit", [
             "hiwiit+al[aspect=dubitative]", "hiwt+inay[aspect=gerundial]",
             "hiwiit+ʔaa[aspect=durative]", "hiwiit[aspect=root]"
         ]),
         ("hoyoo", [
             "hoyoo+al[aspect=dubitative]", "hoy+inay[aspect=gerundial]",
             "hoyoo+ʔaa[aspect=durative]", "hoyoo[aspect=root]"
         ]),
         ("ʔilk", [
             "ʔilk+al[aspect=dubitative]", "ʔilk+inay[aspect=gerundial]",
             "ʔiliik+ʔaa[aspect=durative]", "ʔilk[aspect=root]"
         ])
     ]
     generate = (self.paradigm.stems_to_forms
                 @ self.paradigm.feature_label_rewriter)
     for (stem, expected) in stems_and_forms:
         predicted = rewrite.rewrites(stem, generate)
         self.assertSameElements(expected, predicted)
Example #12
    def normalize(
        self,
        text: str,
        n_tagged: int,
        punct_pre_process: bool = True,
        punct_post_process: bool = True,
        verbose: bool = False,
    ) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
            punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """
        if punct_pre_process:
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text

        text = pynini.escape(text)

        if n_tagged == -1:
            tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            tagged_texts = rewrite.top_rewrites(text,
                                                self.tagger.fst,
                                                nshortest=n_tagged)
        # Non-deterministic English normalization uses the tagger composed with the verbalizer, with no permutation in between.
        if self.lang == 'en':
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                self._verbalize(tagged_text, normalized_texts)

        if len(normalized_texts) == 0:
            raise ValueError()
        if punct_post_process:
            normalized_texts = [
                post_process_punctuation(t) for t in normalized_texts
            ]

            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [
                    self.processor.detokenize([t]) for t in normalized_texts
                ]
        normalized_texts = set(normalized_texts)
        return normalized_texts
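A usage sketch for a method like the one above. The class name, import path, and constructor arguments are assumptions for illustration (this normalize() resembles NeMo's NormalizerWithAudio, but nothing in the snippet pins that down):

 # Hypothetical driver; the import and constructor arguments are assumptions.
 from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

 normalizer = NormalizerWithAudio(input_case="cased", lang="en")
 for option in normalizer.normalize("12 kg", n_tagged=10):
     print(option)  # e.g. "twelve kilograms"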
Example #13
 def testGenerator(self):
     generator = (self.paradigm.stems_to_forms
                  @ self.paradigm.feature_label_rewriter)
     forms = rewrite.rewrites("noct__1000__", generator)
     self.assertSameElements(
         [
             "nox__1000__+[case=nom][num=sg]",
             "noct__1000__+is[case=gen][num=sg]",
             "noct__1000__+ī[case=dat][num=sg]",
             "noct__1000__+em[case=acc][num=sg]",
             "noct__1000__+e[case=abl][num=sg]",
             "noct__1000__+ēs[case=nom][num=pl]",
             "noct__1000__+um[case=gen][num=pl]",
             "noct__1000__+ibus[case=dat][num=pl]",
             "noct__1000__+ēs[case=acc][num=pl]",  # Also -īs for /i/ stems.
             "noct__1000__+ibus[case=abl][num=pl]"
         ],
         forms)
     forms = rewrite.rewrites("rēg__1003__", generator)
     self.assertSameElements([
         "rēx__1003__+[case=nom][num=sg]",
         "rēg__1003__+is[case=gen][num=sg]",
         "rēg__1003__+ī[case=dat][num=sg]",
         "rēg__1003__+em[case=acc][num=sg]",
         "rēg__1003__+e[case=abl][num=sg]",
         "rēg__1003__+ēs[case=nom][num=pl]",
         "rēg__1003__+um[case=gen][num=pl]",
         "rēg__1003__+ibus[case=dat][num=pl]",
         "rēg__1003__+ēs[case=acc][num=pl]",
         "rēg__1003__+ibus[case=abl][num=pl]"
     ], forms)
     forms = rewrite.rewrites("ōs__1001__", generator)
     self.assertSameElements([
         "ōs__1001__+[case=nom][num=sg]", "ōr__1001__+is[case=gen][num=sg]",
         "ōr__1001__+ī[case=dat][num=sg]",
         "ōr__1001__+em[case=acc][num=sg]",
         "ōr__1001__+e[case=abl][num=sg]",
         "ōr__1001__+ēs[case=nom][num=pl]",
         "ōr__1001__+um[case=gen][num=pl]",
         "ōr__1001__+ibus[case=dat][num=pl]",
         "ōr__1001__+ēs[case=acc][num=pl]",
         "ōr__1001__+ibus[case=abl][num=pl]"
     ], forms)
Example #14
 def _get_tagged_text(self, text, n_tagged):
     """
     Returns text after tokenize and classify
     Args;
         text: input  text
         n_tagged: number of tagged options to consider, -1 - return all possible tagged options
     """
     if n_tagged == -1:
         if self.lang == "en":
              # keep ARPABET phonemes in the list of options
             if "[" in text and "]" in text:
                 tagged_texts = rewrite.rewrites(text, self.tagger.fst)
             else:
                 try:
                     tagged_texts = rewrite.rewrites(
                         text, self.tagger.fst_no_digits)
                 except pynini.lib.rewrite.Error:
                     tagged_texts = rewrite.rewrites(text, self.tagger.fst)
         else:
             tagged_texts = rewrite.rewrites(text, self.tagger.fst)
     else:
         if self.lang == "en":
              # keep ARPABET phonemes in the list of options
             if "[" in text and "]" in text:
                 tagged_texts = rewrite.top_rewrites(text,
                                                     self.tagger.fst,
                                                     nshortest=n_tagged)
             else:
                 try:
                     # try self.tagger graph that produces output without digits
                     tagged_texts = rewrite.top_rewrites(
                         text,
                         self.tagger.fst_no_digits,
                         nshortest=n_tagged)
                 except pynini.lib.rewrite.Error:
                     tagged_texts = rewrite.top_rewrites(text,
                                                         self.tagger.fst,
                                                         nshortest=n_tagged)
         else:
             tagged_texts = rewrite.top_rewrites(text,
                                                 self.tagger.fst,
                                                 nshortest=n_tagged)
     return tagged_texts
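The helper above switches between rewrite.rewrites (all outputs) and rewrite.top_rewrites (the n cheapest). The difference in one toy sketch (the weighted rule is an assumption, not the tagger used here):

 import pynini
 from pynini.lib import pynutil, rewrite

 # Toy weighted rule: "seven" is the cheaper reading of "7".
 rule = pynini.union(
     pynini.cross("7", "seven"),
     pynutil.add_weight(pynini.cross("7", "seventh"), 1.0),
 ).optimize()

 print(rewrite.rewrites("7", rule))                   # both options
 print(rewrite.top_rewrites("7", rule, nshortest=1))  # only the cheapest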
Example #15
 def testGenerator(self):
     generator = (self.paradigm.stems_to_forms
                  @ self.paradigm.feature_label_rewriter)
     forms = rewrite.rewrites("aqu", generator)
     self.assertSameElements([
         "aqu+a[case=nom][num=sg]", "aqu+ae[case=gen][num=sg]",
         "aqu+ae[case=dat][num=sg]", "aqu+am[case=acc][num=sg]",
         "aqu+ā[case=abl][num=sg]", "aqu+ae[case=nom][num=pl]",
         "aqu+ārum[case=gen][num=pl]", "aqu+īs[case=dat][num=pl]",
         "aqu+ās[case=acc][num=pl]", "aqu+īs[case=abl][num=pl]"
     ], forms)
Example #16
 def testOptionalRewrite(self):
     rule = pynini.cdrewrite(pynutil.delete(self.td),
                             self.consonant,
                             "[EOS]",
                             self.sigstar,
                             mode="opt").optimize()
     with self.assertRaisesRegex(rewrite.Error, r"Multiple top rewrites"):
         unused_var = rewrite.one_top_rewrite("fist", rule)
     self.assertCountEqual(["fist", "fis"], rewrite.rewrites("fist", rule))
     self.assertTrue(rewrite.matches("fist", "fis", rule))
     self.assertTrue(rewrite.matches("fist", "fist", rule))
     self.assertFalse(rewrite.matches("fis", "fist", rule))
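The fixtures self.td, self.consonant, and self.sigstar are defined elsewhere in the test class; a minimal sketch of plausible definitions (assumptions, not the originals) under which this test and the next one run:

 import string

 import pynini
 from pynini.lib import pynutil, rewrite

 sigstar = pynini.union(*string.ascii_lowercase).closure().optimize()  # sigma-star over a-z
 td = pynini.union("t", "d")                        # coronal stops targeted for deletion
 consonant = pynini.union(*"bcdfghjklmnpqrstvwxz")  # left context for the rule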
Example #17
 def testMandatoryRewrite(self):
     rule = pynini.cdrewrite(pynutil.delete(self.td), self.consonant,
                             "[EOS]", self.sigstar).optimize()
     rewrites = tuple(rewrite.rewrites("fist", rule))
     # pylint: disable=g-generic-assert
     self.assertEqual(len(rewrites), 1)
     # pylint: enable=g-generic-assert
     self.assertEqual("fis", rewrites[0])
     self.assertEqual("fis", rewrite.top_rewrite("fist", rule))
     self.assertEqual("fis", rewrite.one_top_rewrite("fist", rule))
     self.assertTrue(rewrite.matches("fist", "fis", rule))
     self.assertFalse(rewrite.matches("fis", "fist", rule))
Example #18
 def testGenerator(self):
     generator = (self.paradigm_a.stems_to_forms
                  @ self.paradigm_a.feature_label_rewriter)
     forms = rewrite.rewrites("grádus", generator)
     self.assertSameElements([
         "grádus[case=nom][num=sg]", "grádus+a[case=gen][num=sg]",
         "grádus+u[case=dat][num=sg]", "grádus[case=acc][num=sg]",
         "grádus+om[case=ins][num=sg]", "grádus+e[case=prp][num=sg]",
         "grádus+y[case=nom][num=pl]", "grádus+ov[case=gen][num=pl]",
         "grádus+am[case=dat][num=pl]", "grádus+y[case=acc][num=pl]",
         "grádus+ami[case=ins][num=pl]", "grádus+ax[case=prp][num=pl]"
     ], forms)
     generator = (self.paradigm_b.stems_to_forms
                  @ self.paradigm_b.feature_label_rewriter)
     forms = rewrite.rewrites("stól", generator)
     self.assertSameElements([
         "stól[case=nom][num=sg]", "stol+á[case=gen][num=sg]",
         "stol+ú[case=dat][num=sg]", "stól[case=acc][num=sg]",
         "stol+óm[case=ins][num=sg]", "stol+é[case=prp][num=sg]",
         "stol+óv[case=gen][num=pl]", "stol+ý[case=acc][num=pl]",
         "stol+ý[case=nom][num=pl]", "stol+ám[case=dat][num=pl]",
         "stol+ámi[case=ins][num=pl]", "stol+áx[case=prp][num=pl]"
     ], forms)
Example #19
 def get_verbalized_text(tagged_text):
     tagged_text = pynini.escape(tagged_text)
     return rewrite.rewrites(tagged_text, self.verbalizer.fst)
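The pynini.escape call matters because unescaped "[", "]", and "\" have special meaning in pynini string compilation (they delimit generated symbols); escaping keeps them literal. A small illustration (the tagged-text string here is just an example):

 import pynini

 # Without escaping, "[NFP]" would compile to one generated symbol
 # instead of five literal characters.
 escaped = pynini.escape('tokens { name: "[NFP]" }')
 fst = pynini.accep(escaped)  # safe: the brackets are now literal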
Example #20
 def testAnalyzer(self):
     analyzer = self.paradigm.analyzer @ self.paradigm.feature_label_rewriter
     forms = rewrite.rewrites("ōs", analyzer)
     self.assertSameElements(["ōs+[case=nom][num=sg]"], forms)
     forms = rewrite.rewrites("rēge", analyzer)
     self.assertSameElements(["rēg+e[case=abl][num=sg]"], forms)
Example #21
 def get_verbalized_text(tagged_text):
     return rewrite.rewrites(tagged_text, self.verbalizer.fst)
Example #22
    def normalize(
        self,
        text: str,
        n_tagged: int,
        punct_post_process: bool = True,
        verbose: bool = False,
    ) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """

        assert (
            len(text.split()) < 500
        ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"
        original_text = text
        text = pre_process(text)  # pre-process to handle square brackets, e.g. [25] -> [ 25 ]

        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)

        if self.lm:
            if self.lang != "en":
                raise ValueError(f"{self.lang} is not supported in LM mode")
            try:
                lattice = rewrite.rewrite_lattice(
                    text, self.tagger.fst_no_digits)
            except pynini.lib.rewrite.Error:
                lattice = rewrite.rewrite_lattice(text, self.tagger.fst)
            lattice = rewrite.lattice_to_nshortest(lattice, n_tagged)
            tagged_texts = [(x[1], float(x[2]))
                            for x in lattice.paths().items()]
            tagged_texts.sort(key=lambda x: x[1])
            tagged_texts, weights = list(zip(*tagged_texts))
        else:
            if n_tagged == -1:
                if self.lang == "en":
                    try:
                        tagged_texts = rewrite.rewrites(
                            text, self.tagger.fst_no_digits)
                    except pynini.lib.rewrite.Error:
                        tagged_texts = rewrite.rewrites(text, self.tagger.fst)
                else:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                if self.lang == "en":
                    try:
                        tagged_texts = rewrite.top_rewrites(
                            text,
                            self.tagger.fst_no_digits,
                            nshortest=n_tagged)
                    except pynini.lib.rewrite.Error:
                        tagged_texts = rewrite.top_rewrites(text,
                                                            self.tagger.fst,
                                                            nshortest=n_tagged)
                else:
                    tagged_texts = rewrite.top_rewrites(text,
                                                        self.tagger.fst,
                                                        nshortest=n_tagged)

        # Non-deterministic English normalization uses the tagger composed with the verbalizer, with no permutation in between.
        if self.lang == "en":
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                self._verbalize(tagged_text, normalized_texts, verbose=verbose)

        if len(normalized_texts) == 0:
            raise ValueError()

        if punct_post_process:
            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [
                    self.processor.detokenize([t]) for t in normalized_texts
                ]
                normalized_texts = [
                    post_process_punct(input=original_text, normalized_text=t)
                    for t in normalized_texts
                ]

        if self.lm:
            return normalized_texts, weights

        normalized_texts = set(normalized_texts)
        return normalized_texts
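The LM branch above drives the lattice helpers directly instead of rewrite.top_rewrites. A minimal sketch of that pattern with a generic weighted rule (the rule is an assumption):

 import pynini
 from pynini.lib import pynutil, rewrite

 rule = pynini.union(
     pynini.cross("a", "x"),
     pynutil.add_weight(pynini.cross("a", "y"), 1.0),
 ).optimize()

 lattice = rewrite.rewrite_lattice("a", rule)
 shortest = rewrite.lattice_to_nshortest(lattice, 2)
 # paths().items() yields (input, output, weight) triples, as in the code above.
 for _, output, weight in shortest.paths().items():
     print(output, float(weight))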
Example #23
    def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, verbose: bool = False) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """
        original_text = text

        if self.lang == "en":
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)

        if n_tagged == -1:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst_no_digits, nshortest=n_tagged)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
            else:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)

        # Non-deterministic English normalization uses the tagger composed with the verbalizer, with no permutation in between.
        if self.lang == "en":
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                self._verbalize(tagged_text, normalized_texts, verbose=verbose)

        if len(normalized_texts) == 0:
            raise ValueError()

        if punct_post_process:
            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
                normalized_texts = [
                    post_process_punct(input=original_text, normalized_text=t) for t in normalized_texts
                ]
            else:
                print("NEMO_NLP collection is not available: skipping punctuation post_processing")

        normalized_texts = set(normalized_texts)
        return normalized_texts
Example #24
 def testTagger(self):
     tagger = self.paradigm.tagger @ self.paradigm.feature_label_rewriter
     forms = rewrite.rewrites("ōs", tagger)
     self.assertSameElements(["ōs[case=nom][num=sg]"], forms)
     forms = rewrite.rewrites("rēge", tagger)
     self.assertSameElements(["rēge[case=abl][num=sg]"], forms)