Exemple #1
0
class TestDate:
    """Date test cases for inverse normalization and for normalization.

    All processor objects are created once in the class body and are ``None``
    when ``PYNINI_AVAILABLE`` is false; every test carries the matching
    ``skipif`` mark.
    """

    # Shared skip mark applied to every test in this class.
    _requires_pynini = pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )

    inverse_normalizer = InverseNormalizer() if PYNINI_AVAILABLE else None
    normalizer = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None
    normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None
    normalizer_uppercased = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None

    # Inline (input, expected) pairs used by test_norm_cased.
    cases_uppercased = {"Aug. 8": "august eighth", "8 Aug.": "the eighth of august", "aug. 8": "august eighth"}

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_date.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare the result with ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_date.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_uncased(self, test_input, expected):
        """Normalize ``test_input``; ``expected`` must also be among the audio-based candidates."""
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=100)
        assert expected in candidates

    @parameterized.expand(cases_uppercased.items())
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Run the ``cases_uppercased`` pairs through ``normalizer_uppercased`` and the audio-based normalizer."""
        normalized = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=100)
        assert expected in candidates
Exemple #2
0
class TestCardinal:
    """English cardinal test cases for inverse normalization and normalization.

    The processors are class attributes built once; they are ``None`` when
    ``PYNINI_AVAILABLE`` is false, in which case the tests are skipped.
    """

    # Shared skip mark applied to every test in this class.
    _requires_pynini = pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )

    inverse_normalizer_en = (
        InverseNormalizer(lang='en') if PYNINI_AVAILABLE else None
    )
    normalizer_en = (
        Normalizer(input_case='cased', lang='en') if PYNINI_AVAILABLE else None
    )
    normalizer_with_audio_en = (
        NormalizerWithAudio(input_case='cased', lang='en')
        if PYNINI_AVAILABLE
        else None
    )

    @parameterized.expand(
        parse_test_case_file(
            'en/data_inverse_text_normalization/test_cases_cardinal.txt'
        )
    )
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(
            test_input, verbose=False
        )
        assert denormalized == expected

    @parameterized.expand(
        parse_test_case_file(
            'en/data_text_normalization/test_cases_cardinal.txt'
        )
    )
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize ``test_input``; ``expected`` must also be among the audio-based candidates."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio_en.normalize(
            test_input, n_tagged=100
        )
        assert expected in candidates
Exemple #3
0
class TestWord:
    """Word test cases for inverse normalization and normalization.

    Both tests are skipped when ``PYNINI_AVAILABLE`` is false.
    """

    # The class body runs at import time, before any skipif mark is evaluated.
    # The processors are therefore only built when pynini is available, so that
    # importing this module does not attempt to construct them otherwise.
    inverse_normalizer = InverseNormalizer() if PYNINI_AVAILABLE else None

    @parameterized.expand(
        parse_test_case_file(
            'data_inverse_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason=
        "`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        pred = self.inverse_normalizer.inverse_normalize(test_input,
                                                         verbose=False)
        assert pred == expected

    # Guarded for the same reason as ``inverse_normalizer`` above.
    normalizer = Normalizer(
        input_case='lower_cased') if PYNINI_AVAILABLE else None

    @parameterized.expand(
        parse_test_case_file('data_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason=
        "`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize ``test_input`` and compare with ``expected``."""
        pred = self.normalizer.normalize(test_input, verbose=False)
        assert pred == expected
Exemple #4
0
class TestCardinal:
    """English cardinal test cases using cached grammars (``cache_dir=CACHE_DIR``).

    The audio-based normalizer is only built when both ``PYNINI_AVAILABLE``
    and ``RUN_AUDIO_BASED_TESTS`` are truthy; otherwise that part of
    ``test_norm`` is not executed.
    """

    # Shared skip mark applied to every test in this class.
    _requires_pynini = pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )

    inverse_normalizer_en = (
        InverseNormalizer(lang='en', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None
    )
    normalizer_en = (
        Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True)
        if PYNINI_AVAILABLE
        else None
    )
    normalizer_with_audio_en = (
        NormalizerWithAudio(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)
        if PYNINI_AVAILABLE and RUN_AUDIO_BASED_TESTS
        else None
    )

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_cardinal.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_cardinal.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize ``test_input``; when the audio-based normalizer exists, ``expected`` must be among its outputs."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False, punct_post_process=False)
        assert normalized == expected, f"input: {test_input}"

        # Nothing more to check when the audio-based normalizer was not built.
        if not self.normalizer_with_audio_en:
            return

        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=30, punct_post_process=False)
        assert expected in candidates, f"input: {test_input}"
Exemple #5
0
class TestTelephone:
    """German (``lang='de'``) telephone test cases for inverse normalization."""

    # None when pynini is unavailable; the test below is skipped in that case.
    inverse_normalizer = (
        InverseNormalizer(lang='de') if PYNINI_AVAILABLE else None
    )

    @parameterized.expand(
        parse_test_case_file(
            'de/data_inverse_text_normalization/test_cases_telephone.txt'
        )
    )
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(
            test_input, verbose=False
        )
        assert denormalized == expected
Exemple #6
0
class TestMeasure:
    """German (``lang='de'``) measure test cases using cached grammars.

    The audio-based normalizer is only built when both ``PYNINI_AVAILABLE``
    and ``CACHE_DIR`` are truthy; otherwise that part of ``test_norm`` is not
    executed.
    """

    # Shared skip mark applied to every test in this class.
    _requires_pynini = pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )

    inverse_normalizer = (
        InverseNormalizer(lang='de', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None
    )
    normalizer = (
        Normalizer(input_case='cased', lang='de', cache_dir=CACHE_DIR, overwrite_cache=False)
        if PYNINI_AVAILABLE
        else None
    )
    normalizer_with_audio = (
        NormalizerWithAudio(input_case='cased', lang='de', cache_dir=CACHE_DIR, overwrite_cache=False)
        if PYNINI_AVAILABLE and CACHE_DIR
        else None
    )

    @parameterized.expand(parse_test_case_file('de/data_inverse_text_normalization/test_cases_measure.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    # NOTE(review): the parameters are (expected, test_input), the reverse of
    # test_denorm; presumably the data file lists the expected output first --
    # confirm against the test-case file.
    @parameterized.expand(parse_test_case_file('de/data_text_normalization/test_cases_measure.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, expected, test_input):
        """Normalize ``test_input``; when the audio-based normalizer exists, ``expected`` must be among its outputs."""
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized == expected

        # Nothing more to check when the audio-based normalizer was not built.
        if not self.normalizer_with_audio:
            return

        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=1000, punct_post_process=False)
        assert expected in candidates
Exemple #7
0
class TestOrdinal:
    """French (``lang='fr'``) ordinal test cases for inverse normalization."""

    # None when pynini is unavailable; the test below is skipped in that case.
    inverse_normalizer = (InverseNormalizer(lang='fr',
                                            cache_dir=CACHE_DIR,
                                            overwrite_cache=False)
                          if PYNINI_AVAILABLE else None)

    @parameterized.expand(
        parse_test_case_file(
            'fr/data_inverse_text_normalization/test_cases_ordinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason=
        "`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(
            test_input, verbose=False)
        assert denormalized == expected
Exemple #8
0
class TestWhitelist:
    """English whitelist test cases for inverse normalization and normalization.

    The processors are class attributes built once; they are ``None`` when
    ``PYNINI_AVAILABLE`` is false, in which case the tests are skipped.
    """

    # Shared skip mark applied to every test in this class.
    _requires_pynini = pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )

    inverse_normalizer_en = (
        InverseNormalizer(lang='en') if PYNINI_AVAILABLE else None
    )
    normalizer_en = (
        Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None
    )
    normalizer_with_audio_en = (
        NormalizerWithAudio(input_case='cased', lang='en')
        if PYNINI_AVAILABLE
        else None
    )
    normalizer_uppercased = (
        Normalizer(input_case='cased', lang='en') if PYNINI_AVAILABLE else None
    )

    # Inline (input, expected) pairs used by test_norm_cased.
    cases_uppercased = {
        "Dr. Evil": "doctor Evil",
        "No. 4": "number four",
        "dr. Evil": "dr. Evil",
        "no. 4": "no. four",
    }

    @parameterized.expand(
        parse_test_case_file(
            'en/data_inverse_text_normalization/test_cases_whitelist.txt'
        )
    )
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(
            test_input, verbose=False
        )
        assert denormalized == expected

    @parameterized.expand(
        parse_test_case_file(
            'en/data_text_normalization/test_cases_whitelist.txt'
        )
    )
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize ``test_input``; ``expected`` must also be among the audio-based candidates."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio_en.normalize(
            test_input, n_tagged=100
        )
        assert expected in candidates

    @parameterized.expand(cases_uppercased.items())
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Run the ``cases_uppercased`` pairs through ``normalizer_uppercased`` and the audio-based normalizer."""
        normalized = self.normalizer_uppercased.normalize(
            test_input, verbose=False
        )
        assert normalized == expected
        candidates = self.normalizer_with_audio_en.normalize(
            test_input, n_tagged=100
        )
        assert expected in candidates
Exemple #9
0
class TestElectronic:
    """Electronic test cases for inverse normalization and normalization.

    The processors are class attributes built once; they are ``None`` when
    ``PYNINI_AVAILABLE`` is false, in which case the tests are skipped.
    """

    # Shared skip mark applied to every test in this class.
    _requires_pynini = pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )

    inverse_normalizer = InverseNormalizer() if PYNINI_AVAILABLE else None
    normalizer = Normalizer(input_case="cased") if PYNINI_AVAILABLE else None
    normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_electronic.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_electronic.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize ``test_input``; ``expected`` must also be among the audio-based candidates."""
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=100, punct_post_process=False)
        assert expected in candidates
Exemple #10
0
class TestWhitelist:
    """Whitelist test cases for inverse normalization and normalization.

    All tests are skipped when ``PYNINI_AVAILABLE`` is false.
    """

    # The class body runs at import time, before any skipif mark is evaluated.
    # The processors are therefore only built when pynini is available, so that
    # importing this module does not attempt to construct them otherwise.
    inverse_normalizer = InverseNormalizer() if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert pred == expected

    # Guarded for the same reason as ``inverse_normalizer`` above.
    normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize ``test_input`` and compare with ``expected``."""
        pred = self.normalizer.normalize(test_input, verbose=False)
        assert pred == expected

    # Guarded for the same reason as ``inverse_normalizer`` above.
    normalizer_uppercased = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None
    # Inline (input, expected) pairs used by test_norm_cased.
    cases_uppercased = {"Dr. Evil": "doctor Evil", "No. 4": "number four", "dr. Evil": "dr. Evil", "no. 4": "no. four"}

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Run the ``cases_uppercased`` pairs through ``normalizer_uppercased``."""
        pred = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert pred == expected
        file_path: file path
        data: list of string
        
    """
    with open(file_path, 'w') as fp:
        for line in data:
            fp.write(line + '\n')


def parse_args():
    """Parse the command-line options of this script.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()`` with the
        attributes ``input``, ``language``, ``output`` and ``verbose``.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--input",
        type=str,
        required=True,
        help="input file path",
    )
    arg_parser.add_argument(
        "--language",
        type=str,
        default="en",
        choices=['en', 'de', 'ar'],
        help="language",
    )
    arg_parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="output file path",
    )
    arg_parser.add_argument(
        "--verbose",
        action='store_true',
        help="print denormalization info. For debugging",
    )
    parsed = arg_parser.parse_args()
    return parsed


if __name__ == "__main__":
    # Script entry point: read the input file, inverse-normalize every entry
    # and write the predictions to the output file.
    args = parse_args()
    file_path = args.input
    inverse_normalizer = InverseNormalizer(lang=args.language)

    print(f"Loading data: {file_path}")
    sentences = load_file(file_path)
    print(f"- Data: {len(sentences)} sentences")

    predictions = inverse_normalizer.inverse_normalize_list(
        sentences,
        verbose=args.verbose,
    )
    write_file(args.output, predictions)
    # This message is printed after the file has already been written.
    print(f"- Denormalized. Writing out to {args.output}")
Exemple #12
0
class TestDate:
    """English date test cases using cached grammars (``cache_dir=CACHE_DIR``).

    The audio-based normalizer is only built when both ``PYNINI_AVAILABLE``
    and ``RUN_AUDIO_BASED_TESTS`` are truthy; otherwise the audio-based part
    of the normalization tests is not executed.
    """

    # Shared skip mark applied to every test in this class.
    _requires_pynini = pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )

    inverse_normalizer_en = (
        InverseNormalizer(lang='en', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None
    )
    normalizer_en = (
        Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True)
        if PYNINI_AVAILABLE
        else None
    )
    normalizer_with_audio_en = (
        NormalizerWithAudio(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)
        if PYNINI_AVAILABLE and RUN_AUDIO_BASED_TESTS
        else None
    )
    normalizer_uppercased = (
        Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)
        if PYNINI_AVAILABLE
        else None
    )

    # Inline (input, expected) pairs used by test_norm_cased.
    cases_uppercased = {"Aug. 8": "august eighth", "8 Aug.": "the eighth of august", "aug. 8": "august eighth"}

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_date.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and compare with ``expected``."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_date.txt'))
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_uncased(self, test_input, expected):
        """Normalize ``test_input``; when the audio-based normalizer exists, ``expected`` must be among its outputs."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected

        # Nothing more to check when the audio-based normalizer was not built.
        if not self.normalizer_with_audio_en:
            return

        candidates = self.normalizer_with_audio_en.normalize(test_input, punct_post_process=False, n_tagged=100)
        assert expected in candidates, f"INPUT: {test_input}"

    @parameterized.expand(cases_uppercased.items())
    @_requires_pynini
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Run the ``cases_uppercased`` pairs through ``normalizer_uppercased`` and, if built, the audio-based normalizer."""
        normalized = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert normalized == expected

        # Nothing more to check when the audio-based normalizer was not built.
        if not self.normalizer_with_audio_en:
            return

        candidates = self.normalizer_with_audio_en.normalize(test_input, punct_post_process=False, n_tagged=30)
        assert expected in candidates
Exemple #13
0
        choices=known_types,
    )
    parser.add_argument("--filter",
                        action='store_true',
                        help="clean data for inverse normalization purposes")
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: load evaluation data, optionally filter it, and run
    # the inverse normalizer over the normalized sentences.
    # Example usage:
    # python run_evaluate.py --input=<INPUT> --cat=<CATEGORY> --filter
    args = parse_args()
    # NOTE(review): filter_loaded_data is only imported when args.lang == 'en',
    # but it is called below whenever --filter is set; for any other language
    # that call would raise NameError -- confirm the allowed values of --lang.
    if args.lang == 'en':
        from nemo_text_processing.inverse_text_normalization.en.clean_eval_data import filter_loaded_data
    file_path = args.input
    # NOTE(review): the normalizer is built without passing args.lang --
    # confirm that the default language is intended here.
    inverse_normalizer = InverseNormalizer()

    print("Loading training data: " + file_path)
    training_data = load_files([file_path])

    if args.filter:
        training_data = filter_loaded_data(training_data)

    # No category given: evaluate on whole sentences.
    if args.category is None:
        print("Sentence level evaluation...")
        # The third value returned by training_data_to_sentences is not used.
        sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences(
            training_data)
        print("- Data: " + str(len(sentences_normalized)) + " sentences")
        sentences_prediction = inverse_normalizer.inverse_normalize_list(
            sentences_normalized)
        print("- Denormalized. Evaluating...")
Exemple #14
0
def main(args):
    """Inverse-normalize one fixed English sentence.

    ``args`` is accepted but not read, and the result of the
    ``inverse_normalize`` call is discarded.
    """
    sentence = "we paid one hundred and twenty three dollars for this desk, and this."
    InverseNormalizer(lang='en').inverse_normalize(sentence, verbose=False)