Example #1
0
    def normalize(
        self,
        text: str,
        n_tagged: int,
        punct_pre_process: bool = True,
        punct_post_process: bool = True,
        verbose: bool = False,
    ) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
            punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            A set of normalized text options (usually there are multiple ways of
            normalizing a given semiotic class). If the input is empty after
            stripping, the empty string itself is returned instead of a set.
            NOTE(review): the `-> str` annotation is kept for interface
            compatibility although the non-empty path returns a set.

        Raises:
            ValueError: if no normalized option was produced for the input.
        """
        if punct_pre_process:
            text = pre_process(text)
        stripped = text.strip()
        if not stripped:
            # Nothing to normalize: hand the (empty) string straight back.
            if verbose:
                print(stripped)
            return stripped

        escaped = pynini.escape(stripped)

        if n_tagged == -1:
            tagged_texts = rewrite.rewrites(escaped, self.tagger.fst)
        else:
            tagged_texts = rewrite.top_rewrites(escaped, self.tagger.fst, nshortest=n_tagged)

        # non-deterministic Eng normalization uses tagger composed with verbalizer, no permutation in between
        if self.lang == 'en':
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                # _verbalize appends its results to the list passed in.
                self._verbalize(tagged_text, normalized_texts)

        if not normalized_texts:
            raise ValueError(f"No normalization options were produced for input: {stripped!r}")

        if punct_post_process:
            normalized_texts = [post_process_punctuation(option) for option in normalized_texts]

            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [self.processor.detokenize([option]) for option in normalized_texts]

        # Collapse duplicate options before returning.
        return set(normalized_texts)
Example #2
0
    def normalize(self,
                  text: str,
                  verbose: bool = False,
                  punct_pre_process: bool = False,
                  punct_post_process: bool = False) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            verbose: whether to print intermediate meta information
            punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
            punct_post_process: whether to normalize punctuation

        Returns: spoken form (the empty string if the input is empty after stripping)

        Raises:
            ValueError: if none of the reordered tag candidates could be verbalized.
        """
        if punct_pre_process:
            text = pre_process(text)
        stripped = text.strip()
        if not stripped:
            # Nothing to normalize: hand the (empty) string straight back.
            if verbose:
                print(stripped)
            return stripped

        escaped = pynini.escape(stripped)
        tagged_lattice = self.find_tags(escaped)
        tagged_text = self.select_tag(tagged_lattice)
        if verbose:
            print(tagged_text)

        # Load the tagged string into the parser, then parse it into tokens.
        self.parser(tagged_text)
        tokens = self.parser.parse()

        # Try each reordered candidate in turn and return the first one the
        # verbalizer accepts (i.e. whose lattice is non-empty).
        for candidate in self.generate_permutations(tokens):
            verbalizer_lattice = self.find_verbalizer(pynini.escape(candidate))
            if verbalizer_lattice.num_states() == 0:
                continue
            output = self.select_verbalizer(verbalizer_lattice)
            if punct_post_process:
                output = post_process_punctuation(output)
                # do post-processing based on Moses detokenizer
                if self.processor:
                    output = self.processor.detokenize([output])
            return output

        raise ValueError(f"No valid verbalization was found for input: {stripped!r}")