def test_apply_nlp_options_stop():
    """With only stop-word removal on, 'in' and 'the' are dropped from the hypothesis."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_remove_stop_words(True)

    cleaned = NLPOptions().apply_nlp_options(nlp_model, 'thingy in the dancing')

    # The expected text carries a leading space before the first kept word.
    assert cleaned == ' thingy dancing'
def test_apply_nlp_options_n2w():
    """With only number-to-word conversion on, digit tokens are spelled out."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_n2w(True)

    spelled_out = NLPOptions().apply_nlp_options(nlp_model, '34 567')

    # The expected text carries a leading space before the first token.
    assert spelled_out == ' thirty-four five hundred and sixty-seven'
def test_apply_nlp_options_expand_cont():
    """With only contraction expansion on, "isn't" and "can't" are expanded."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_expand_contractions(True)

    expanded = NLPOptions().apply_nlp_options(nlp_model, "isn't can't")

    assert expanded == 'is not cannot'
def test_apply_nlp_options_stem():
    """With only stemming on, each word is reduced to its stem."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_apply_stemming(True)

    stemmed = NLPOptions().apply_nlp_options(nlp_model, 'thingy dancing')

    # The expected text carries a trailing space after the last stem.
    assert stemmed == 'thingi danc '
def test_stem_stop_n2w_exp():
    """All four options together: expansion, number words, stop-word removal, stemming."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_n2w(True)
    nlp_model.set_remove_stop_words(True)
    nlp_model.set_apply_stemming(True)
    nlp_model.set_expand_contractions(True)

    hypothesis = "don't in the won't didn't isn't raining 3 6 9"
    processed = NLPOptions().apply_nlp_options(nlp_model, hypothesis)

    assert processed == 'rain three six nine '
def test_stem_stop_n2w():
    """Number words, stop-word removal and stemming combined, without contraction expansion."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_n2w(True)
    nlp_model.set_remove_stop_words(True)
    nlp_model.set_apply_stemming(True)

    processed = NLPOptions().apply_nlp_options(nlp_model, "666 don't in the raining")

    assert processed == 'six hundr sixtysix nt rain '
def test_apply_nlp_options_stem_n2w_exp():
    """Number words, contraction expansion and stemming combined; stop words are kept."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_n2w(True)
    nlp_model.set_expand_contractions(True)
    nlp_model.set_apply_stemming(True)

    processed = NLPOptions().apply_nlp_options(nlp_model, "666 don't in the raining")

    assert processed == 'six hundr and sixty-six do not in the rain '
def test_apply_nlp_options_stop_n2w_exp():
    """Number words, contraction expansion and stop-word removal combined, without stemming."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_n2w(True)
    nlp_model.set_expand_contractions(True)
    nlp_model.set_remove_stop_words(True)

    processed = NLPOptions().apply_nlp_options(nlp_model, "666 don't in the")

    assert processed == ' six hundred sixtysix'
def test_create_unique_root_2():
    """create_unique_root encodes the configuration and NLP settings into the root name."""
    from utilities.utilities import Utilities
    from model.configuration import Configuration
    from model.nlp import NLPModel

    utilities = Utilities()
    config = Configuration()
    nlp = NLPModel()

    # Speech configuration under test.
    config.set_model('video')
    config.set_use_enhanced(False)
    config.set_language_code('fr_FR')
    config.set_alternative_language_codes(['en-US', 'ru-RU'])
    config.set_speech_context('hi', 5)

    # NLP switches under test: everything on except stemming.
    nlp.set_remove_stop_words(True)
    nlp.set_apply_stemming(False)
    nlp.set_expand_contractions(True)
    nlp.set_n2w(True)

    unique_root = utilities.create_unique_root('12345', config, nlp)

    assert unique_root == (
        '12345_video_fr_FR_alts_applied_speech_adaptation_applied_boost_5'
        '_stop_words_removed_contractions_expanded_numbers_converted_2_words'
    )
def test_update_csv():
    """update_csv appends a row containing the URI, language code and NLP flags."""
    from utilities.io_handler import IOHandler
    from model.configuration import Configuration
    from model.nlp import NLPModel
    import os

    config = Configuration()
    nlp = NLPModel()
    handler = IOHandler()
    csv_name = handler._result_file_name

    handler.set_result_path('test_results_path')
    handler.write_csv_header()

    audio_uri = 'gs://foo/bar/baz/test.flac'
    language = 'fr-FR'
    nlp.set_apply_stemming(True)
    config.set_language_code(language)
    handler.update_csv(audio_uri, config, nlp)

    csv_path = f'{handler.get_result_path()}/{csv_name}'
    with open(csv_path, 'r') as csv_file:
        written = csv_file.read()
    # Remove the file before asserting so a failed assertion leaves nothing behind.
    os.remove(csv_path)

    assert audio_uri in written
    assert language in written
    assert 'True' in written
def test_apply_nlp_options_stem_exp():
    """Contraction expansion combined with stemming: "won't" becomes 'will not'."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_model = NLPModel()
    nlp_model.set_apply_stemming(True)
    nlp_model.set_expand_contractions(True)

    processed = NLPOptions().apply_nlp_options(nlp_model, "thingy dancing won't")

    assert processed == 'thingi danc will not '
class IOHandler(object):
    """File helper for the results CSV, HTML diagnostics, the queue file and hypothesis text.

    All result files are written under the directory set with
    ``set_result_path``; the queue file is opened by its own name.
    """

    _result_path = ''
    _result_file_name = 'results.csv'
    _csv_header = 'AUDIO_FILE, MODEL, ENHANCED, LANGUAGE, ALTERNATIVE_LANGS, PHRASE_HINTS_APPLIED, BOOST, REF_WORD_COUNT, REF_ERROR_COUNT , WER,STEMMING_APPLIED , STOP_WORDS_REMOVED, NUMBER_TO_WORD_CONVERSION, CONTRACTIONS_EXPANDED, INSERTIONS, DELETIONS, SUBSTITUTIONS, DELETED_WORDS, INSERTED_WORDS, SUBSTITUTE_WORDS\n'
    _csv_header_written = False
    configuration = Configuration()
    nlp_model = NLPModel()
    _queue_file_name = 'queue.txt'

    def set_queue_file_name(self, name):
        """Set the path of the queue file used by the queue read/write methods."""
        self._queue_file_name = name

    def get_queue_file_name(self):
        """Return the current queue file path."""
        return self._queue_file_name

    def set_result_path(self, result_path):
        """Set the directory that result files are written to."""
        self._result_path = result_path

    def get_result_path(self):
        """Return the directory that result files are written to."""
        return self._result_path

    def write_csv_header(self):
        """Create the results CSV (truncating any existing file) and write its header.

        Does nothing when this instance has already written the header. The
        result directory is created if it does not exist. Write errors are
        printed rather than raised.
        """
        import os

        if self._csv_header_written:
            return

        result_path = self.get_result_path()
        full_path = f'{result_path}/{self._result_file_name}'
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(result_path, exist_ok=True)
        with open(full_path, 'w') as file:
            try:
                file.write(self._csv_header)
            # FileNotFoundError is a subclass of IOError (OSError), so it must
            # be listed first or its handler can never run.
            except FileNotFoundError as x:
                print(f'Can not find csv file: {x}')
            except IOError as i:
                print(f'Can not write csv header: {i}')
        self._csv_header_written = True

    @staticmethod
    def _format_word_counts(word_counts):
        """Render a word->count mapping as concatenated 'word:count, ' pairs ('' if empty/None)."""
        if not word_counts:
            return ''
        return ''.join(f'{k}:{v}, ' for k, v in word_counts.items())

    def update_csv(self, uri, configuration, nlp_model, word_count_list=None,
                   ref_total_word_count=0, ref_error_count=0,
                   word_error_rate=0, ins=0, deletions=0, subs=0):
        """Append one result row to the results CSV.

        Args:
            uri: identifier of the audio file, written as the first column.
            configuration: object providing get_model, get_use_enhanced,
                get_language_code, get_alternative_language_codes,
                get_phrases and get_boost.
            nlp_model: object providing get_apply_stemming,
                get_remove_stop_words, get_n2w and get_expand_contractions.
            word_count_list: optional sequence whose first three items are
                word->count mappings for deleted, inserted and substituted
                words, in that order.
            ref_total_word_count, ref_error_count, word_error_rate, ins,
            deletions, subs: values written verbatim into the row.

        Write errors are printed rather than raised.
        """
        import logging
        logging.basicConfig(filename='wer_app.log')
        logger = logging.getLogger(__name__)
        from collections import OrderedDict

        deleted_words_dict = None
        inserted_words_dict = None
        substitute_words_dict = None
        if word_count_list:
            try:
                # Each mapping is ordered by ascending count.
                deleted_words_dict = OrderedDict(
                    sorted(word_count_list[0].items(), key=lambda x: x[1]))
                inserted_words_dict = OrderedDict(
                    sorted(word_count_list[1].items(), key=lambda x: x[1]))
                substitute_words_dict = OrderedDict(
                    sorted(word_count_list[2].items(), key=lambda x: x[1]))
            except TypeError as t:
                # Counts that cannot be compared: report it and write no word lists.
                message = f'{t}'
                logger.debug(message)
                print(message)
                deleted_words_dict = None
                inserted_words_dict = None
                substitute_words_dict = None

        deleted_words = self._format_word_counts(deleted_words_dict)
        inserted_words = self._format_word_counts(inserted_words_dict)
        substitute_words = self._format_word_counts(substitute_words_dict)

        full_path = f'{self.get_result_path()}/{self._result_file_name}'

        # NOTE(review): treats a None list of alternative languages as empty;
        # confirm Configuration can actually return None here.
        alt_codes = configuration.get_alternative_language_codes() or ()
        alts = ''.join(item + ' ' for item in alt_codes)

        # NOTE: the separators below are irregular on purpose-preserved grounds;
        # existing result files use exactly this layout.
        row = f'{uri}, {configuration.get_model()}, {configuration.get_use_enhanced()}, {configuration.get_language_code()},' \
              f'{alts}, {bool(configuration.get_phrases())},' \
              f'{configuration.get_boost()}, {ref_total_word_count}, {ref_error_count}, {word_error_rate}, {nlp_model.get_apply_stemming()},' \
              f'{nlp_model.get_remove_stop_words()}, {nlp_model.get_n2w()}, {nlp_model.get_expand_contractions()}, {ins}, {deletions}, {subs}, ' \
              f'{deleted_words}, {inserted_words}, {substitute_words}\n'

        with open(full_path, 'a+') as file:
            try:
                file.write(row)
            except IOError as i:
                print(f'Can not update csv file: {i}')
        print(f'UPDATED: {full_path}')

    def write_html_diagnostic(self, wer_obj, unique_root, result_path):
        """Write ``wer_obj.aligned_htmls`` joined by '<br>' to ``<result_path>/<unique_root>.html``.

        Write errors are printed rather than raised.
        """
        aligned_html = '<br>'.join(wer_obj.aligned_htmls)
        write_path = f'{result_path}/{unique_root}.html'
        with open(write_path, 'w') as f:
            try:
                f.write(aligned_html)
            except IOError as i:
                print(f'Can not write html diagnostic {write_path}: {i}')
        print(f'WROTE: diagnostic file: {write_path} ')

    def write_queue_file(self, data):
        """Append items to the queue file, each followed by a comma.

        Args:
            data: a string (split on whitespace into items) or an iterable
                of strings.

        I/O errors are printed rather than raised.
        """
        try:
            with open(self._queue_file_name, 'a+') as f:
                info = data.split() if isinstance(data, str) else data
                f.writelines(item + ',' for item in info)
        except IOError as e:
            print(f'Can not write queue file: {e}')

    def read_queue_file(self):
        """Return the full contents of the queue file.

        Raises:
            IOError: when the file is missing, unreadable or empty.
        """
        result = None
        try:
            with open(self._queue_file_name, 'r') as f:
                result = f.read()
        # The specific handler must precede IOError, its base class, to be reachable.
        except FileNotFoundError as x:
            print(f'Queue file not found: {x}')
        except IOError as e:
            print(f'Can not read queue file: {e}')
        if not result:
            raise IOError('No contents found in queue')
        return result

    def write_hyp(self, file_name, text):
        """Write ``text`` to ``file_name`` in the result directory, creating the directory if needed."""
        import os

        result_path = self.get_result_path()
        os.makedirs(result_path, exist_ok=True)
        with open(f'{result_path}/{file_name}', 'w+') as f:
            f.write(text)
# --- command-line options -------------------------------------------------
parser.add_argument(
    '-alts', '--alternative_languages',
    default=None,
    nargs='+',
    required=False,
    help="Space separated list of language codes for auto language detection. Example en-IN en-US en-GB",
)
parser.add_argument(
    '-p', '--phrase_file',
    required=False,
    type=str,
    help='Path to file containing comma separated phrases',
)
parser.add_argument(
    '-b', '--boosts',
    default=[],
    nargs='+',
    required=False,
    help='Space separated list of boost values to evaluate for speech adaptation',
)
parser.add_argument(
    '-ch', '--multi',
    required=False,
    type=int,
    help='Integer indicating the number of channels if more than one',
)
parser.add_argument(
    '-q', '--random_queue',
    required=False,
    action='store_true',
    help='Replaces default queue.txt with randomly named queue file',
)
parser.add_argument(
    '-fake', '--fake_hyp',
    required=False,
    action='store_true',
    help='Use a fake hypothesis for testing',
)
parser.add_argument(
    '-limit', '--limit',
    required=False,
    default=None,
    type=int,
    help='Limit to X number of audio files',
)
parser.add_argument(
    '-nzb', '--no_zeros_boost',
    required=False,
    action='store_true',
    help='skip boost of 0',
)
parser.add_argument(
    '-single', '--single_word',
    required=False,
    action='store_true',
    help='process each letter rather than whole words',
)
parser.add_argument(
    '-lf', '--local_files_path',
    required=False,
    type=str,
    help='process local files',
    default=None,
)

# --- collaborators --------------------------------------------------------
nlp_model = NLPModel()
io_handler = IOHandler()
nlp_options = NLPOptions()
configuration = Configuration()

# Automatic punctuation is switched on unconditionally; per the original
# author's note it has no bearing on WER.
configuration.set_enableAutomaticPunctuation(True)

# --- apply parsed arguments -----------------------------------------------
args = parser.parse_args()
no_zeros_for_boost = args.no_zeros_boost
process_each_letter = args.single_word
local_files_path = args.local_files_path
limit = args.limit
cloud_store_uri = args.cloud_store_uri
io_handler.set_result_path(args.local_results_path)
only_transcribe = args.transcriptions_only
nlp_model.set_n2w(args.numbers_to_words)
# --- command-line options -------------------------------------------------
parser.add_argument(
    '-l', '--langs',
    default=['en-US'],
    nargs='+',
    required=False,
    help="Space separated list of language codes. Each processed seperately. Example en-AU en-GB",
)
parser.add_argument(
    '-alts', '--alternative_languages',
    default=None,
    nargs='+',
    required=False,
    help="Space separated list of language codes for auto language detection. Example en-IN en-US en-GB",
)
parser.add_argument(
    '-p', '--phrase_file',
    required=False,
    type=str,
    help='Path to file containing comma separated phrases',
)
parser.add_argument(
    '-b', '--boosts',
    default=[],
    nargs='+',
    required=False,
    help='Space separated list of boost values to evaluate for speech adaptation',
)
parser.add_argument(
    '-ch', '--multi',
    required=False,
    type=int,
    help='Integer indicating the number of channels if more than one',
)
parser.add_argument(
    '-a', '--alts2prime',
    required=False,
    action='store_true',
    help='Use each alternative language as a primary language',
)
parser.add_argument(
    '-q', '--random_queue',
    required=False,
    action='store_true',
    help='Replaces default queue.txt with randomly named queue file',
)
parser.add_argument(
    '-fake', '--fake_hyp',
    required=False,
    action='store_true',
    help='Use a fake hypothesis for testing',
)
parser.add_argument(
    '-limit', '--limit',
    required=False,
    default=None,
    type=int,
    help='Limit to X number of audio files',
)

# --- collaborators --------------------------------------------------------
nlp_model = NLPModel()
io_handler = IOHandler()
nlp_options = NLPOptions()
configuration = Configuration()

# Automatic punctuation is switched on unconditionally; per the original
# author's note it has no bearing on WER.
configuration.set_enableAutomaticPunctuation(True)

# --- apply parsed arguments -----------------------------------------------
args = parser.parse_args()
limit = args.limit
cloud_store_uri = args.cloud_store_uri
io_handler.set_result_path(args.local_results_path)
only_transcribe = args.transcriptions_only

# Copy the NLP switches from the command line onto the model.
nlp_model.set_n2w(args.numbers_to_words)
nlp_model.set_apply_stemming(args.stem)
nlp_model.set_remove_stop_words(args.remove_stop_words)
nlp_model.set_expand_contractions(args.expand)
class NLPOptions(object):
    """Text normalisation steps applied to a hypothesis string.

    Each step takes and returns a string. ``apply_nlp_options`` runs the
    steps enabled on an NLP model in a fixed order.
    """

    nlp_model = NLPModel()
    contractions = None

    def __init__(self):
        """Load the contraction -> expansion dictionary used by expand_contractions."""
        from utilities.contractions import contractions_dictionary
        self.contractions = contractions_dictionary

    def expand_contractions(self, text):
        """Replace known contractions in ``text`` and strip remaining apostrophes.

        Matching is case-insensitive; the first character of the matched
        text is kept so the original capitalisation survives.
        """
        import re

        contractions = self.contractions
        contractions_pattern = re.compile(
            '({})'.format('|'.join(contractions.keys())),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            # Try the text as matched first, then its lower-case form.
            expanded_contraction = (contractions.get(match)
                                    or contractions.get(match.lower()))
            expanded_contraction = match[0] + expanded_contraction[1:]
            # Hack for this bug: https://github.com/dipanjanS/practical-machine-learning-with-python/issues/24
            if expanded_contraction == "as not":
                return "is not"
            return expanded_contraction

        expanded_text = contractions_pattern.sub(expand_match, text)
        return expanded_text.replace("'", "")

    def convert_numbers_to_words(self, text):
        """Spell out all-digit tokens; every token in the result is preceded by a space.

        Does not handle ordinals such as 21st or 4th.
        """
        from nltk.tokenize import word_tokenize
        import inflect

        engine = inflect.engine()
        return ''.join(
            f' {engine.number_to_words(token)}' if token.isdigit() else f' {token}'
            for token in word_tokenize(text))

    def remove_stop_words(self, text):
        """Strip punctuation and drop English stop words; each kept word is preceded by a space."""
        from nltk.corpus import stopwords
        import string

        stop_words = set(stopwords.words('english'))
        table = str.maketrans('', '', string.punctuation)
        # Punctuation is removed before the stop-word test, so a word emptied
        # by stripping is still emitted (as a lone space).
        stripped = (w.translate(table) for w in text.split())
        return ''.join(' ' + w for w in stripped if w not in stop_words)

    def apply_stemming(self, text):
        """Porter-stem every token; each stem in the result is followed by a space."""
        from nltk.stem import PorterStemmer
        from nltk.tokenize import word_tokenize

        ps = PorterStemmer()
        return ''.join(ps.stem(w) + ' ' for w in word_tokenize(text))

    def apply_nlp_options(self, nlp_model, hyp):
        """Apply the steps enabled on ``nlp_model`` to ``hyp``.

        Steps always run in this order: expand contractions, numbers to
        words, remove stop words, stemming. This matches every combination
        the previous if/elif chain handled. It also covers the two it
        missed, which used to return None: stop words with contraction
        expansion, and no options at all (``hyp`` is returned unchanged).

        Args:
            nlp_model: object providing get_expand_contractions, get_n2w,
                get_remove_stop_words and get_apply_stemming.
            hyp: the hypothesis text.

        Returns:
            The processed text.
        """
        pipeline = (
            (nlp_model.get_expand_contractions(), self.expand_contractions),
            (nlp_model.get_n2w(), self.convert_numbers_to_words),
            (nlp_model.get_remove_stop_words(), self.remove_stop_words),
            (nlp_model.get_apply_stemming(), self.apply_stemming),
        )
        result = hyp
        for enabled, step in pipeline:
            if enabled:
                result = step(result)
        return result