class TestDate:
    """Date test cases for inverse text normalization and text normalization.

    The normalizer objects are built once at class-definition time. Each one is
    ``None`` when ``PYNINI_AVAILABLE`` is false; every test in the class carries
    a matching ``skipif`` mark for that condition.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer()
    else:
        inverse_normalizer = None

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    if PYNINI_AVAILABLE:
        normalizer = Normalizer(input_case='cased')
        normalizer_with_audio = NormalizerWithAudio(input_case='cased')
    else:
        normalizer = None
        normalizer_with_audio = None

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_uncased(self, test_input, expected):
        """Check both normalizers against the date cases from the data file.

        The plain normalizer must return exactly ``expected``; the audio-based
        normalizer's output (requested with ``n_tagged=100``) must contain it.
        """
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized == expected

        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=100)
        assert expected in candidates

    if PYNINI_AVAILABLE:
        normalizer_uppercased = Normalizer(input_case='cased')
    else:
        normalizer_uppercased = None

    # Inline cases mixing capitalized and lower-case month abbreviations.
    cases_uppercased = {
        "Aug. 8": "august eighth",
        "8 Aug.": "the eighth of august",
        "aug. 8": "august eighth",
    }

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Run the inline ``cases_uppercased`` pairs through both normalizers."""
        normalized = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert normalized == expected

        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=100)
        assert expected in candidates
class TestCardinal:
    """English cardinal-number test cases for ITN and TN.

    Normalizers are created once as class attributes and are ``None`` when
    ``PYNINI_AVAILABLE`` is false; each test carries a ``skipif`` mark for the
    same condition.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer_en = InverseNormalizer(lang='en')
    else:
        inverse_normalizer_en = None

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_cardinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(input_case='cased', lang='en')
        normalizer_with_audio_en = NormalizerWithAudio(input_case='cased', lang='en')
    else:
        normalizer_en = None
        normalizer_with_audio_en = None

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_cardinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check both normalizers against the cardinal cases from the data file.

        The plain normalizer must return exactly ``expected``; the audio-based
        normalizer's output (requested with ``n_tagged=100``) must contain it.
        """
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected

        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=100)
        assert expected in candidates
class TestWord:
    """Plain-word test cases for inverse text normalization and text normalization.

    The normalizers are built at class-definition time, i.e. while the test
    module is being imported. They are therefore only constructed when
    ``PYNINI_AVAILABLE`` is true; otherwise the attributes are ``None`` and the
    ``skipif`` marks on the tests keep them from being used. Without this guard
    the class body would try to build the normalizers before the ``skipif``
    marks could take effect.
    """

    inverse_normalizer = InverseNormalizer() if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert pred == expected

    normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalizing ``test_input`` must give exactly ``expected``."""
        pred = self.normalizer.normalize(test_input, verbose=False)
        assert pred == expected
class TestCardinal:
    """English cardinal-number test cases for ITN and TN, using cached grammars.

    Every normalizer is created with ``cache_dir=CACHE_DIR`` and
    ``overwrite_cache=False``. Attributes are ``None`` when ``PYNINI_AVAILABLE``
    is false; the audio-based normalizer additionally requires
    ``RUN_AUDIO_BASED_TESTS``.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer_en = InverseNormalizer(lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        inverse_normalizer_en = None

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_cardinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(
            input_case='cased',
            lang='en',
            cache_dir=CACHE_DIR,
            overwrite_cache=False,
            post_process=True,
        )
    else:
        normalizer_en = None

    if PYNINI_AVAILABLE and RUN_AUDIO_BASED_TESTS:
        normalizer_with_audio_en = NormalizerWithAudio(
            input_case='cased',
            lang='en',
            cache_dir=CACHE_DIR,
            overwrite_cache=False,
        )
    else:
        normalizer_with_audio_en = None

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_cardinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check the cardinal cases against the plain and, if enabled, audio-based normalizer.

        The plain normalizer must return exactly ``expected``. The audio-based
        check only runs when ``normalizer_with_audio_en`` is truthy, and then
        requires ``expected`` to be contained in its output.
        """
        normalized = self.normalizer_en.normalize(test_input, verbose=False, punct_post_process=False)
        assert normalized == expected, f"input: {test_input}"

        # Audio-based normalizer was not built: nothing more to verify.
        if not self.normalizer_with_audio_en:
            return

        candidates = self.normalizer_with_audio_en.normalize(
            test_input,
            n_tagged=30,
            punct_post_process=False,
        )
        assert expected in candidates, f"input: {test_input}"
class TestTelephone:
    """German telephone-number test cases for inverse text normalization.

    The inverse normalizer is ``None`` when ``PYNINI_AVAILABLE`` is false; the
    test carries a ``skipif`` mark for the same condition.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='de')
    else:
        inverse_normalizer = None

    @parameterized.expand(parse_test_case_file('de/data_inverse_text_normalization/test_cases_telephone.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected
class TestMeasure:
    """German measure test cases for ITN and TN, using cached grammars.

    Every normalizer is created with ``cache_dir=CACHE_DIR`` and
    ``overwrite_cache=False``. Attributes are ``None`` when ``PYNINI_AVAILABLE``
    is false; the audio-based normalizer is additionally only built when
    ``CACHE_DIR`` is truthy.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='de', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        inverse_normalizer = None

    @parameterized.expand(parse_test_case_file('de/data_inverse_text_normalization/test_cases_measure.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    if PYNINI_AVAILABLE:
        normalizer = Normalizer(input_case='cased', lang='de', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        normalizer = None

    if PYNINI_AVAILABLE and CACHE_DIR:
        normalizer_with_audio = NormalizerWithAudio(
            input_case='cased',
            lang='de',
            cache_dir=CACHE_DIR,
            overwrite_cache=False,
        )
    else:
        normalizer_with_audio = None

    # NOTE(review): the parameters below are (expected, test_input), the reverse
    # of test_denorm. parameterized binds positionally, so this presumably
    # mirrors the column order of the data file -- confirm before changing it.
    @parameterized.expand(parse_test_case_file('de/data_text_normalization/test_cases_measure.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, expected, test_input):
        """Check the measure cases against the plain and, if built, audio-based normalizer.

        The plain normalizer must return exactly ``expected``. The audio-based
        check only runs when ``normalizer_with_audio`` is truthy, and then
        requires ``expected`` to be contained in its output.
        """
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized == expected

        # Audio-based normalizer was not built: nothing more to verify.
        if not self.normalizer_with_audio:
            return

        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=1000, punct_post_process=False)
        assert expected in candidates
class TestOrdinal:
    """French ordinal test cases for inverse text normalization, using cached grammars.

    The inverse normalizer is ``None`` when ``PYNINI_AVAILABLE`` is false; the
    test carries a ``skipif`` mark for the same condition.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        inverse_normalizer = None

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_ordinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected
class TestWhitelist:
    """English whitelist test cases for ITN and TN.

    Normalizers are created once as class attributes and are ``None`` when
    ``PYNINI_AVAILABLE`` is false; each test carries a ``skipif`` mark for the
    same condition.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer_en = InverseNormalizer(lang='en')
    else:
        inverse_normalizer_en = None

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(input_case='lower_cased')
        normalizer_with_audio_en = NormalizerWithAudio(input_case='cased', lang='en')
    else:
        normalizer_en = None
        normalizer_with_audio_en = None

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check both normalizers against the whitelist cases from the data file.

        The lower-cased normalizer must return exactly ``expected``; the
        audio-based normalizer's output (``n_tagged=100``) must contain it.
        """
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected

        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=100)
        assert expected in candidates

    if PYNINI_AVAILABLE:
        normalizer_uppercased = Normalizer(input_case='cased', lang='en')
    else:
        normalizer_uppercased = None

    # Inline cases: only the capitalized abbreviations are expected to expand.
    cases_uppercased = {
        "Dr. Evil": "doctor Evil",
        "No. 4": "number four",
        "dr. Evil": "dr. Evil",
        "no. 4": "no. four",
    }

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Run the inline ``cases_uppercased`` pairs through both cased normalizers."""
        normalized = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert normalized == expected

        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=100)
        assert expected in candidates
class TestElectronic:
    """Electronic-entity test cases for inverse text normalization and text normalization.

    Normalizers are created once as class attributes and are ``None`` when
    ``PYNINI_AVAILABLE`` is false; each test carries a ``skipif`` mark for the
    same condition.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer()
    else:
        inverse_normalizer = None

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_electronic.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    if PYNINI_AVAILABLE:
        normalizer = Normalizer(input_case="cased")
        normalizer_with_audio = NormalizerWithAudio(input_case='cased')
    else:
        normalizer = None
        normalizer_with_audio = None

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_electronic.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check both normalizers against the electronic cases from the data file.

        The plain normalizer must return exactly ``expected``; the audio-based
        normalizer's output (``n_tagged=100``, ``punct_post_process=False``)
        must contain it.
        """
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized == expected

        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=100, punct_post_process=False)
        assert expected in candidates
class TestWhitelist:
    """Whitelist test cases for inverse text normalization and text normalization.

    The normalizers are built at class-definition time, i.e. while the test
    module is being imported. They are therefore only constructed when
    ``PYNINI_AVAILABLE`` is true; otherwise the attributes are ``None`` and the
    ``skipif`` marks on the tests keep them from being used. Without this guard
    the class body would try to build the normalizers before the ``skipif``
    marks could take effect.
    """

    inverse_normalizer = InverseNormalizer() if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert pred == expected

    normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalizing ``test_input`` with the lower-cased normalizer must give ``expected``."""
        pred = self.normalizer.normalize(test_input, verbose=False)
        assert pred == expected

    normalizer_uppercased = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None

    # Inline cases: only the capitalized abbreviations are expected to expand.
    cases_uppercased = {
        "Dr. Evil": "doctor Evil",
        "No. 4": "number four",
        "dr. Evil": "dr. Evil",
        "no. 4": "no. four",
    }

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Run the inline ``cases_uppercased`` pairs through the cased normalizer."""
        pred = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert pred == expected
file_path: file path data: list of string """ with open(file_path, 'w') as fp: for line in data: fp.write(line + '\n') def parse_args(): parser = ArgumentParser() parser.add_argument("--input", help="input file path", required=True, type=str) parser.add_argument("--language", help="language", choices=['en', 'de','ar'], default="en", type=str) parser.add_argument("--output", help="output file path", required=True, type=str) parser.add_argument("--verbose", help="print denormalization info. For debugging", action='store_true') return parser.parse_args() if __name__ == "__main__": args = parse_args() file_path = args.input inverse_normalizer = InverseNormalizer(lang=args.language) print("Loading data: " + file_path) data = load_file(file_path) print("- Data: " + str(len(data)) + " sentences") inverse_normalizer_prediction = inverse_normalizer.inverse_normalize_list(data, verbose=args.verbose) write_file(args.output, inverse_normalizer_prediction) print(f"- Denormalized. Writing out to {args.output}")
class TestDate:
    """English date test cases for ITN and TN, using cached grammars.

    Every normalizer is created with ``cache_dir=CACHE_DIR`` and
    ``overwrite_cache=False``. Attributes are ``None`` when ``PYNINI_AVAILABLE``
    is false; the audio-based normalizer additionally requires
    ``RUN_AUDIO_BASED_TESTS``.
    """

    if PYNINI_AVAILABLE:
        inverse_normalizer_en = InverseNormalizer(lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        inverse_normalizer_en = None

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalizing ``test_input`` must give exactly ``expected``."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(
            input_case='cased',
            lang='en',
            cache_dir=CACHE_DIR,
            overwrite_cache=False,
            post_process=True,
        )
    else:
        normalizer_en = None

    if PYNINI_AVAILABLE and RUN_AUDIO_BASED_TESTS:
        normalizer_with_audio_en = NormalizerWithAudio(
            input_case='cased',
            lang='en',
            cache_dir=CACHE_DIR,
            overwrite_cache=False,
        )
    else:
        normalizer_with_audio_en = None

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_uncased(self, test_input, expected):
        """Check the date cases against the plain and, if enabled, audio-based normalizer.

        The plain normalizer must return exactly ``expected``. The audio-based
        check only runs when ``normalizer_with_audio_en`` is truthy, and then
        requires ``expected`` to be contained in its output.
        """
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected

        # Audio-based normalizer was not built: nothing more to verify.
        if not self.normalizer_with_audio_en:
            return

        candidates = self.normalizer_with_audio_en.normalize(test_input, punct_post_process=False, n_tagged=100)
        assert expected in candidates, f"INPUT: {test_input}"

    if PYNINI_AVAILABLE:
        normalizer_uppercased = Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        normalizer_uppercased = None

    # Inline cases mixing capitalized and lower-case month abbreviations.
    cases_uppercased = {
        "Aug. 8": "august eighth",
        "8 Aug.": "the eighth of august",
        "aug. 8": "august eighth",
    }

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Run the inline ``cases_uppercased`` pairs through the cased normalizers.

        The audio-based check is skipped when ``normalizer_with_audio_en`` is
        falsy.
        """
        normalized = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert normalized == expected

        # Audio-based normalizer was not built: nothing more to verify.
        if not self.normalizer_with_audio_en:
            return

        candidates = self.normalizer_with_audio_en.normalize(test_input, punct_post_process=False, n_tagged=30)
        assert expected in candidates
choices=known_types, ) parser.add_argument("--filter", action='store_true', help="clean data for inverse normalization purposes") return parser.parse_args() if __name__ == "__main__": # Example usage: # python run_evaluate.py --input=<INPUT> --cat=<CATEGORY> --filter args = parse_args() if args.lang == 'en': from nemo_text_processing.inverse_text_normalization.en.clean_eval_data import filter_loaded_data file_path = args.input inverse_normalizer = InverseNormalizer() print("Loading training data: " + file_path) training_data = load_files([file_path]) if args.filter: training_data = filter_loaded_data(training_data) if args.category is None: print("Sentence level evaluation...") sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences( training_data) print("- Data: " + str(len(sentences_normalized)) + " sentences") sentences_prediction = inverse_normalizer.inverse_normalize_list( sentences_normalized) print("- Denormalized. Evaluating...")
def main(args):
    """Build an English inverse normalizer and run it once on a fixed sentence.

    ``args`` is accepted but not read, and the denormalized text is discarded;
    the function returns ``None``.
    """
    english_itn = InverseNormalizer(lang='en')
    sample_sentence = "we paid one hundred and twenty three dollars for this desk, and this."
    english_itn.inverse_normalize(sample_sentence, verbose=False)