Example #1
def inverse_normalize(text: str, verbose: bool) -> str:
    """
    Main function. Normalizes spoken tokens in the given text to written form
        e.g. twelve kilograms -> 12 kg

    Args:
        text: string that may include semiotic classes
        verbose: whether to print intermediate meta information

    Returns: written form
    """

    text = pynini.escape(text)
    tagged_lattice = find_tags(text)
    tagged_text = select_tag(tagged_lattice)
    parser(tagged_text)
    tokens = parser.parse()
    tags_reordered = generate_permutations(tokens)
    for tagged_text in tags_reordered:
        tagged_text = pynini.escape(tagged_text)
        verbalizer_lattice = find_verbalizer(tagged_text)
        if verbalizer_lattice.num_states() == 0:
            continue
        output = select_verbalizer(verbalizer_lattice)
        if verbose:
            print(output)
        return output
    raise ValueError()
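For orientation, a hedged call sketch of the function above; the grammar helpers it relies on (find_tags, select_tag, parser, generate_permutations, find_verbalizer, select_verbalizer) are assumed to be initialized elsewhere in the module from the tagger and verbalizer FSTs.

# Hypothetical usage, assuming the module-level grammar helpers are already built.
written = inverse_normalize("twelve kilograms", verbose=False)
print(written)  # expected, per the docstring: "12 kg"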
Example #2
    def __init__(self, deterministic: bool = True):
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        symbols_to_exclude = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
        graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude), 1)

        # leave phones of format [HH AH0 L OW1] untouched
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phoneme = (
            pynini.accep(pynini.escape("["))
            + pynini.closure(phoneme_unit + pynini.accep(" "))
            + phoneme_unit
            + pynini.accep(pynini.escape("]"))
        )

        if not deterministic:
            phoneme = (
                pynini.accep(pynini.escape("["))
                + pynini.closure(pynini.accep(" "), 0, 1)
                + pynini.closure(phoneme_unit + pynini.accep(" "))
                + phoneme_unit
                + pynini.closure(pynini.accep(" "), 0, 1)
                + pynini.accep(pynini.escape("]"))
            )
        self.graph = plurals._priority_union(convert_space(phoneme), graph, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Example #3
    def normalize(self, text: str, verbose: bool) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            verbose: whether to print intermediate meta information

        Returns: spoken form
        """
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)
        tagged_lattice = self.find_tags(text)
        tagged_text = self.select_tag(tagged_lattice)
        if verbose:
            print(tagged_text)
        self.parser(tagged_text)
        tokens = self.parser.parse()
        tags_reordered = self.generate_permutations(tokens)
        for tagged_text in tags_reordered:
            tagged_text = pynini.escape(tagged_text)
            verbalizer_lattice = self.find_verbalizer(tagged_text)
            if verbalizer_lattice.num_states() == 0:
                continue
            output = self.select_verbalizer(verbalizer_lattice)
            return output
        raise ValueError()
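A minimal usage sketch of the method above, assuming it belongs to NeMo's Normalizer class (the import path and constructor arguments below are assumptions, not shown in the excerpt):

# Hypothetical usage of the normalize() method shown above.
from nemo_text_processing.text_normalization.normalize import Normalizer  # assumed import path

normalizer = Normalizer(input_case="cased", lang="en")  # assumed constructor arguments
spoken = normalizer.normalize("12 kg", verbose=False)
print(spoken)  # expected, per the docstring: "twelve kilograms"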
Example #4
    def normalize(self,
                  text: str,
                  verbose: bool = False,
                  punct_pre_process: bool = False,
                  punct_post_process: bool = False) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            verbose: whether to print intermediate meta information
            punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
            punct_post_process: whether to normalize punctuation

        Returns: spoken form
        """
        original_text = text
        if punct_pre_process:
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)
        tagged_lattice = self.find_tags(text)
        tagged_text = self.select_tag(tagged_lattice)
        if verbose:
            print(tagged_text)
        self.parser(tagged_text)
        tokens = self.parser.parse()
        tags_reordered = self.generate_permutations(tokens)
        for tagged_text in tags_reordered:
            tagged_text = pynini.escape(tagged_text)

            verbalizer_lattice = self.find_verbalizer(tagged_text)
            if verbalizer_lattice.num_states() == 0:
                continue
            output = self.select_verbalizer(verbalizer_lattice)
            if punct_post_process:
                # do post-processing based on Moses detokenizer
                if self.processor:
                    output = self.processor.moses_detokenizer.detokenize(
                        [output], unescape=False)
                    output = post_process_punct(input=original_text,
                                                normalized_text=output)
                else:
                    print(
                        "NEMO_NLP collection is not available: skipping punctuation post_processing"
                    )
            return output
        raise ValueError()
Example #5
    def normalize_with_audio(self, text: str, verbose: bool = False) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            verbose: whether to print intermediate meta information

        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)

        def get_tagged_texts(text):
            tagged_lattice = self.find_tags(text)
            tagged_texts = self.select_all_semiotic_tags(tagged_lattice)
            return tagged_texts

        tagged_texts = set(get_tagged_texts(text))
        normalized_texts = []

        for tagged_text in tagged_texts:
            self.parser(tagged_text)
            tokens = self.parser.parse()
            tags_reordered = self.generate_permutations(tokens)
            for tagged_text_reordered in tags_reordered:
                tagged_text_reordered = pynini.escape(tagged_text_reordered)

                verbalizer_lattice = self.find_verbalizer(tagged_text_reordered)
                if verbalizer_lattice.num_states() == 0:
                    continue

                verbalized = self.get_all_verbalizers(verbalizer_lattice)
                for verbalized_option in verbalized:
                    normalized_texts.append(verbalized_option)

        if len(normalized_texts) == 0:
            raise ValueError()

        normalized_texts = [post_process(t) for t in normalized_texts]
        normalized_texts = set(normalized_texts)
        return normalized_texts
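Since normalize_with_audio returns a set of candidate spoken forms rather than a single string, the caller is expected to choose among them (for example by scoring against an ASR transcript). A hedged sketch, assuming normalizer is an instance of the defining class:

# Hypothetical usage: iterate over the candidate normalization options.
options = normalizer.normalize_with_audio("12 kg", verbose=False)
for option in sorted(options):
    print(option)  # one of the possible spoken forms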
Example #6
    def _verbalize(self,
                   tagged_text: str,
                   normalized_texts: List[str],
                   verbose: bool = False):
        """
        Verbalizes tagged text

        Args:
            tagged_text: text with tags
            normalized_texts: list of possible normalization options
            verbose: if true prints intermediate classification results
        """
        def get_verbalized_text(tagged_text):
            return rewrite.rewrites(tagged_text, self.verbalizer.fst)

        self.parser(tagged_text)
        tokens = self.parser.parse()
        tags_reordered = self.generate_permutations(tokens)
        for tagged_text_reordered in tags_reordered:
            try:
                tagged_text_reordered = pynini.escape(tagged_text_reordered)
                normalized_texts.extend(
                    get_verbalized_text(tagged_text_reordered))
                if verbose:
                    print(tagged_text_reordered)

            except pynini.lib.rewrite.Error:
                continue
Example #7
    def normalize(
        self,
        text: str,
        n_tagged: int,
        punct_pre_process: bool = True,
        punct_post_process: bool = True,
        verbose: bool = False,
    ) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider; -1 returns all possible tagged options
            punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """
        if punct_pre_process:
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text

        text = pynini.escape(text)

        if n_tagged == -1:
            tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            tagged_texts = rewrite.top_rewrites(text,
                                                self.tagger.fst,
                                                nshortest=n_tagged)
        # non-deterministic Eng normalization uses tagger composed with verbalizer, no permutation in between
        if self.lang == 'en':
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                self._verbalize(tagged_text, normalized_texts)

        if len(normalized_texts) == 0:
            raise ValueError()
        if punct_post_process:
            normalized_texts = [
                post_process_punctuation(t) for t in normalized_texts
            ]

            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [
                    self.processor.detokenize([t]) for t in normalized_texts
                ]
        normalized_texts = set(normalized_texts)
        return normalized_texts
Example #8
 def testVerifyAsciiDefinition(self):
     ascii_char = pynini.string_map(
         # UTF-8 ASCII uses the all single byte characters with most
         # significant bit set to 0, barring NUL, which we ignore.
         pynini.escape(chr(codepoint))
         for codepoint in range(1, 128)).optimize()
     self.assertFsasEquivalent(ascii_char, utf8.SINGLE_BYTE)
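The string_map-over-escaped-characters pattern used in these tests can be reproduced in isolation; a small sketch restricted to the ASCII digits (no dependency on the byte/utf8 modules under test):

import pynini

# Union acceptor over the escaped single-character strings "0".."9",
# built the same way as ascii_char above.
digit_char = pynini.string_map(
    pynini.escape(chr(codepoint)) for codepoint in range(ord("0"), ord("9") + 1)
).optimize()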
Example #9
 def testVerifyUtf8CharRegionalIndicatorSymbolDefinition(self):
     regional_indicator = pynini.string_map(
         # Regional indicator symbols have codepoints in the range 0x1F1E6
         # through 0x1F1FF.
         pynini.escape(chr(codepoint))
         for codepoint in range(0x1F1E6, 0x1F1FF + 1)).optimize()
     self.assertFsasEquivalent(
         regional_indicator, utf8.VALID_UTF8_CHAR_REGIONAL_INDICATOR_SYMBOL)
Example #10
 def testVerifyUtf8Rfc3629Definition(self):
     utf8_rfc3629_char = pynini.string_map(
         # UTF-8 encoded strings can store codepoints in U+0000 through
         # U+0x10FFFF, excluding the surrogate halves in U+D800 through
         # U+DFFF, but we exclude U+0000 as it would be strange to match NUL
         # and that label is reserved for epsilon.
         pynini.escape(chr(codepoint))
         for codepoint in range(1, 0x10FFFF + 1)
         if not 0xD800 <= codepoint <= 0xDFFF).optimize()
     self.assertFsasEquivalent(utf8_rfc3629_char, utf8.VALID_UTF8_CHAR)
Example #11
    def __init__(self, deterministic: bool = True):
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        punct = PunctuationFst().graph
        self.graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, punct.project("input")), 1)

        if not deterministic:
            self.graph = pynini.closure(
                pynini.difference(
                    self.graph, pynini.union("$", "€", "₩", "£", "¥", "#", "%") + pynini.closure(NEMO_DIGIT, 1)
                ),
                1,
            )

        # leave phones of format [HH AH0 L OW1] untouched
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phoneme = (
            pynini.accep(pynini.escape("["))
            + pynini.closure(phoneme_unit + pynini.accep(" "))
            + phoneme_unit
            + pynini.accep(pynini.escape("]"))
        )
        self.graph = plurals._priority_union(convert_space(phoneme), self.graph, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Example #12
  def _make_feature_mapper(self) -> pynini.Fst:
    r"""Convenience function generating a map to human-readable strings.

    Returns:
      A transducer that maps from internal symbols like "[case=nom]" to a
      sequence that will be readable as a string ("\[case=nom\]") for all
      feature-value combinations.
    """
    pairs = []
    for feature in self._features:
      name = feature.name
      for value in feature.values:
        f = f"[{name}={value}]"
        v = pynini.escape(f"[{name}={value}]")
        pairs.append(pynini.cross(f, v))
    return pynini.union(*pairs).closure().optimize()
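A worked example of the escaping step above (a sketch, not part of the original class): pynini.escape backslash-escapes "[", "]" and "\", so the output side compiles as literal characters while the unescaped input side is parsed as a bracketed symbol.

import pynini

# The input side "[case=nom]" is compiled as a single generated symbol;
# the escaped output side "\[case=nom\]" is compiled as plain characters.
internal = "[case=nom]"
readable = pynini.escape(internal)
pair = pynini.cross(internal, readable).optimize()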
Example #13
    def post_process(self, normalized_text: 'pynini.FstLike') -> str:
        """
        Runs post processing graph on normalized text

        Args:
            normalized_text: normalized text

        Returns: shortest path
        """
        normalized_text = normalized_text.strip()
        if not normalized_text:
            return normalized_text
        normalized_text = pynini.escape(normalized_text)

        if self.post_processor is not None:
            normalized_text = top_rewrite(normalized_text,
                                          self.post_processor.fst)
        return normalized_text
Example #14
        def ApplyOnText(self, text: str) -> str:
            """Transduce the given string using the FST.

            Args:
                text: Input string to be transduced.

            Returns:
                Transduced string output.

            Raises:
                ValueError on Pynini string compilation exceptions.

            This operation involves pre-composing the input string with the FST and
            then finding the shortest path to output a resultant string.
            """
            try:
                # Square brackets and backslash carry special meaning in Pynini.
                # So they need to be escaped for unmanaged strings.
                return pynini.shortestpath(
                    pynini.escape(text) @ self._fst).string()
            except pynini.FstOpError as error:
                raise ValueError(
                    f'{error} on the string (between quotes): `{text}`')
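The escape-compose-shortestpath pattern in ApplyOnText can be exercised with a toy transducer standing in for self._fst (a sketch under that assumption):

import pynini

# Toy rewrite rule in place of the real FST: maps the literal string "12" to "twelve".
toy_fst = pynini.cross("12", "twelve").optimize()

# Same pattern as ApplyOnText: escape the input, pre-compose, take the shortest path.
result = pynini.shortestpath(pynini.escape("12") @ toy_fst).string()
print(result)  # -> twelve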
Example #15
    def normalize(
        self,
        text: str,
        n_tagged: int,
        punct_post_process: bool = True,
        verbose: bool = False,
    ) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider; -1 returns all possible tagged options
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """

        assert (
            len(text.split()) < 500
        ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"
        original_text = text
        text = pre_process(text)  # to handle []

        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)

        if self.lm:
            if self.lang not in ["en"]:
                raise ValueError(f"{self.lang} is not supported in LM mode")

            if self.lang == "en":
                try:
                    lattice = rewrite.rewrite_lattice(
                        text, self.tagger.fst_no_digits)
                except pynini.lib.rewrite.Error:
                    lattice = rewrite.rewrite_lattice(text, self.tagger.fst)
                lattice = rewrite.lattice_to_nshortest(lattice, n_tagged)
                tagged_texts = [(x[1], float(x[2]))
                                for x in lattice.paths().items()]
                tagged_texts.sort(key=lambda x: x[1])
                tagged_texts, weights = list(zip(*tagged_texts))
        else:
            if n_tagged == -1:
                if self.lang == "en":
                    try:
                        tagged_texts = rewrite.rewrites(
                            text, self.tagger.fst_no_digits)
                    except pynini.lib.rewrite.Error:
                        tagged_texts = rewrite.rewrites(text, self.tagger.fst)
                else:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                if self.lang == "en":
                    try:
                        tagged_texts = rewrite.top_rewrites(
                            text,
                            self.tagger.fst_no_digits,
                            nshortest=n_tagged)
                    except pynini.lib.rewrite.Error:
                        tagged_texts = rewrite.top_rewrites(text,
                                                            self.tagger.fst,
                                                            nshortest=n_tagged)
                else:
                    tagged_texts = rewrite.top_rewrites(text,
                                                        self.tagger.fst,
                                                        nshortest=n_tagged)

        # non-deterministic Eng normalization uses tagger composed with verbalizer, no permutation in between
        if self.lang == "en":
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                self._verbalize(tagged_text, normalized_texts, verbose=verbose)

        if len(normalized_texts) == 0:
            raise ValueError()

        if punct_post_process:
            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [
                    self.processor.detokenize([t]) for t in normalized_texts
                ]
                normalized_texts = [
                    post_process_punct(input=original_text, normalized_text=t)
                    for t in normalized_texts
                ]

        if self.lm:
            return normalized_texts, weights

        normalized_texts = set(normalized_texts)
        return normalized_texts
Example #16
    def normalize(self,
                  text: str,
                  verbose: bool = False,
                  punct_pre_process: bool = False,
                  punct_post_process: bool = False) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            verbose: whether to print intermediate meta information
            punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
            punct_post_process: whether to normalize punctuation

        Returns: spoken form
        """
        assert (
            len(text.split()) < 500
        ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"

        original_text = text
        if punct_pre_process:
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)
        tagged_lattice = self.find_tags(text)
        tagged_text = self.select_tag(tagged_lattice)
        if verbose:
            print(tagged_text)
        self.parser(tagged_text)
        tokens = self.parser.parse()
        split_tokens = self._split_tokens_to_reduce_number_of_permutations(
            tokens)
        output = ""
        for s in split_tokens:
            tags_reordered = self.generate_permutations(s)
            verbalizer_lattice = None
            for tagged_text in tags_reordered:
                tagged_text = pynini.escape(tagged_text)

                verbalizer_lattice = self.find_verbalizer(tagged_text)
                if verbalizer_lattice.num_states() != 0:
                    break
            if verbalizer_lattice is None:
                raise ValueError(
                    f"No permutations were generated from tokens {s}")
            output += ' ' + self.select_verbalizer(verbalizer_lattice)
        output = SPACE_DUP.sub(' ', output[1:])

        if self.lang == "en" and hasattr(self, 'post_processor'):
            output = self.post_process(output)

        if punct_post_process:
            # do post-processing based on Moses detokenizer
            if self.processor:
                output = self.processor.moses_detokenizer.detokenize(
                    [output], unescape=False)
                output = post_process_punct(input=original_text,
                                            normalized_text=output)
            else:
                print(
                    "NEMO_NLP collection is not available: skipping punctuation post_processing"
                )

        return output
Example #17
 def testPunct(self) -> None:
     for s in string.punctuation:
         self.assertAccepts(pynini.escape(s), byte.PUNCT)
Example #18
    def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, verbose: bool = False) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
            e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider; -1 returns all possible tagged options
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """
        original_text = text

        if self.lang == "en":
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
                print(text)
            return text
        text = pynini.escape(text)

        if n_tagged == -1:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst_no_digits, nshortest=n_tagged)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
            else:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)

        # non-deterministic Eng normalization uses tagger composed with verbalizer, no permutation in between
        if self.lang == "en":
            normalized_texts = tagged_texts
        else:
            normalized_texts = []
            for tagged_text in tagged_texts:
                self._verbalize(tagged_text, normalized_texts, verbose=verbose)

        if len(normalized_texts) == 0:
            raise ValueError()

        if punct_post_process:
            # do post-processing based on Moses detokenizer
            if self.processor:
                normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
                normalized_texts = [
                    post_process_punct(input=original_text, normalized_text=t) for t in normalized_texts
                ]
            else:
                print("NEMO_NLP collection is not available: skipping punctuation post_processing")

        normalized_texts = set(normalized_texts)
        return normalized_texts
Example #19
 def get_verbalized_text(tagged_text):
     tagged_text = pynini.escape(tagged_text)
     return rewrite.rewrites(tagged_text, self.verbalizer.fst)
Example #20
 def testAsciiBytes(self) -> None:
     for char in range(1, 128):
         self.assertAccepts(pynini.escape(chr(char)), byte.BYTE)