Example #1
    def _get_tagged_text(self, text, n_tagged):
        """
        Returns the text after tokenization and classification.

        Args:
            text: input text
            n_tagged: number of tagged options to consider, -1 - return all possible tagged options
        """
        if n_tagged == -1:
            if self.lang == "en":
                # keep ARPABET phonemes (marked with [...]) in the list of options
                if "[" in text and "]" in text:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
                else:
                    try:
                        # try the tagger graph that produces output without digits
                        tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
                    except pynini.lib.rewrite.Error:
                        tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            if self.lang == "en":
                # keep ARPABET phonemes (marked with [...]) in the list of options
                if "[" in text and "]" in text:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
                else:
                    try:
                        # try the tagger graph that produces output without digits
                        tagged_texts = rewrite.top_rewrites(text, self.tagger.fst_no_digits, nshortest=n_tagged)
                    except pynini.lib.rewrite.Error:
                        tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
            else:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
        return tagged_texts
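
For orientation: rewrite.rewrites enumerates every path through the rewrite lattice, while rewrite.top_rewrites keeps only the nshortest lowest-weight paths, which is why the two branches above differ only in that call. A minimal, self-contained sketch of the same two calls; the toy rule below is illustrative and merely stands in for self.tagger.fst:

import pynini
from pynini.lib import rewrite

# Toy alphabet and a deliberately ambiguous rule: "2" may become "two" or "second".
# This stands in for the real tagger grammar, purely for illustration.
sigma_star = pynini.closure(pynini.union(*"0123456789abcdefghijklmnopqrstuvwxyz ")).optimize()
rule = pynini.cdrewrite(pynini.cross("2", "two") | pynini.cross("2", "second"), "", "", sigma_star)

print(rewrite.rewrites("2 kg", rule))                   # all options, e.g. ['second kg', 'two kg']
print(rewrite.top_rewrites("2 kg", rule, nshortest=1))  # only the single shortest path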
Example #2
    def normalize(
        self,
        text: str,
        n_tagged: int,
        punct_post_process: bool = True,
        verbose: bool = False,
    ):
        """
        Main function. Normalizes tokens from written to spoken form,
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            a set of normalized text options (there are usually multiple ways of
            normalizing a given semiotic class); in LM mode, a (tagged_texts, weights)
            tuple is returned instead
        """

        assert (
            len(text.split()) < 500
        ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"
        original_text = text
        text = pre_process(text)  # to handle [] (ARPABET phoneme markup)

        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)

        if self.lm:
            if self.lang not in ["en"]:
                raise ValueError(f"{self.lang} is not supported in LM mode")

            if self.lang == "en":
                try:
                    lattice = rewrite.rewrite_lattice(
                        text, self.tagger.fst_no_digits)
                except pynini.lib.rewrite.Error:
                    lattice = rewrite.rewrite_lattice(text, self.tagger.fst)
                lattice = rewrite.lattice_to_nshortest(lattice, n_tagged)
                tagged_texts = [(x[1], float(x[2]))
                                for x in lattice.paths().items()]
                tagged_texts.sort(key=lambda x: x[1])
                tagged_texts, weights = list(zip(*tagged_texts))
        else:
            if n_tagged == -1:
                if self.lang == "en":
                    try:
                        tagged_texts = rewrite.rewrites(
                            text, self.tagger.fst_no_digits)
                    except pynini.lib.rewrite.Error:
                        tagged_texts = rewrite.rewrites(text, self.tagger.fst)
                else:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                if self.lang == "en":
                    try:
                        tagged_texts = rewrite.top_rewrites(
                            text,
                            self.tagger.fst_no_digits,
                            nshortest=n_tagged)
                    except pynini.lib.rewrite.Error:
                        tagged_texts = rewrite.top_rewrites(text,
                                                            self.tagger.fst,
                                                            nshortest=n_tagged)
                else:
                    tagged_texts = rewrite.top_rewrites(text,
                                                        self.tagger.fst,
                                                        nshortest=n_tagged)

        # non-deterministic English normalization uses the tagger composed with the verbalizer, with no permutation step in between
        if self.lang == "en":
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                self._verbalize(tagged_text, normalized_texts, verbose=verbose)

        if len(normalized_texts) == 0:
            raise ValueError(f"Normalization failed for: {original_text}")

        if punct_post_process:
            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [
                    self.processor.detokenize([t]) for t in normalized_texts
                ]
                normalized_texts = [
                    post_process_punct(input=original_text, normalized_text=t)
                    for t in normalized_texts
                ]

        if self.lm:
            return normalized_texts, weights

        normalized_texts = set(normalized_texts)
        return normalized_texts
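
A hedged usage sketch for the variant above: in NeMo this method lives on the NormalizerWithAudio class; the module path and constructor arguments below are assumptions based on that tool, not guaranteed by the snippet itself:

# Assumes nemo_text_processing is installed; class and module names are taken
# from NeMo's normalize_with_audio tool and should be treated as assumptions.
from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

normalizer = NormalizerWithAudio(input_case="cased", lang="en")
options = normalizer.normalize("12 kg", n_tagged=10, punct_post_process=True)
for option in options:
    print(option)  # e.g. "twelve kilograms", "one two kilograms", ...

Note that with self.lm set, the method instead returns a (texts, weights) tuple, so callers should branch on the mode they configured.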
Example #3
    def normalize(
        self,
        text: str,
        n_tagged: int,
        punct_post_process: bool = True,
        verbose: bool = False,
    ):
        """
        Main function. Normalizes tokens from written to spoken form,
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            a set of normalized text options (there are usually multiple ways of
            normalizing a given semiotic class)
        """
        original_text = text

        if self.lang == "en":
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)

        if n_tagged == -1:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst_no_digits, nshortest=n_tagged)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
            else:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)

        # non-deterministic English normalization uses the tagger composed with the verbalizer, with no permutation step in between
        if self.lang == "en":
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                self._verbalize(tagged_text, normalized_texts, verbose=verbose)

        if len(normalized_texts) == 0:
            raise ValueError(f"Normalization failed for: {original_text}")

        if punct_post_process:
            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
                normalized_texts = [
                    post_process_punct(input=original_text, normalized_text=t) for t in normalized_texts
                ]
            else:
                print("NEMO_NLP collection is not available: skipping punctuation post_processing")

        normalized_texts = set(normalized_texts)
        return normalized_texts
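
Both variants repeat the same fallback idiom: first try the English grammar whose output contains no digits (fst_no_digits), and fall back to the full grammar when that rewrite fails. A standalone restatement of the pattern (function and parameter names here are illustrative, not part of the original API):

from pynini.lib import rewrite

def tag_with_fallback(text, primary_fst, fallback_fst, n_tagged=-1):
    """Try a restricted grammar first; fall back to the full grammar on rewrite.Error."""
    try:
        if n_tagged == -1:
            return rewrite.rewrites(text, primary_fst)
        return rewrite.top_rewrites(text, primary_fst, nshortest=n_tagged)
    except rewrite.Error:
        if n_tagged == -1:
            return rewrite.rewrites(text, fallback_fst)
        return rewrite.top_rewrites(text, fallback_fst, nshortest=n_tagged)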