def test_apply_nlp_options_stop():
    """With only stop-word removal enabled, 'in' and 'the' are dropped."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    nlp_model.set_remove_stop_words(True)

    actual = nlp_options.apply_nlp_options(nlp_model, 'thingy in the dancing')

    # The stop-word filter prefixes every kept word with a space.
    assert actual == ' thingy dancing'
def test_apply_nlp_options_n2w():
    """With only number-to-word conversion enabled, digit tokens are spelled out."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    nlp_model.set_n2w(True)

    actual = nlp_options.apply_nlp_options(nlp_model, '34 567')

    # Each converted token is emitted with a leading space.
    assert actual == ' thirty-four five hundred and sixty-seven'
def test_apply_nlp_options_expand_cont():
    """With only contraction expansion enabled, "isn't can't" is expanded."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    nlp_model.set_expand_contractions(True)

    actual = nlp_options.apply_nlp_options(nlp_model, "isn't can't")

    assert actual == 'is not cannot'
def test_apply_nlp_options_stem():
    """With only stemming enabled, every word is reduced to its stem."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    nlp_model.set_apply_stemming(True)

    actual = nlp_options.apply_nlp_options(nlp_model, 'thingy dancing')

    # The stemmer appends a space after every stem, hence the trailing blank.
    assert actual == 'thingi danc '
def test_stem_stop_n2w_exp():
    """All four options enabled: only the stemmed word and spelled-out digits remain."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    # Switch every option on, in the same order as the individual setters
    # were originally called.
    for enable in (
        nlp_model.set_n2w,
        nlp_model.set_remove_stop_words,
        nlp_model.set_apply_stemming,
        nlp_model.set_expand_contractions,
    ):
        enable(True)

    actual = nlp_options.apply_nlp_options(
        nlp_model, "don't in the won't didn't isn't raining 3 6 9")

    assert actual == 'rain three six nine '
def test_stem_stop_n2w():
    """Number conversion, stop-word removal and stemming combined (no expansion)."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    for enable in (
        nlp_model.set_n2w,
        nlp_model.set_remove_stop_words,
        nlp_model.set_apply_stemming,
    ):
        enable(True)

    actual = nlp_options.apply_nlp_options(nlp_model, "666 don't in the raining")

    assert actual == 'six hundr sixtysix nt rain '
def test_apply_nlp_options_stem_n2w_exp():
    """Number conversion, contraction expansion and stemming combined (stop words kept)."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    for enable in (
        nlp_model.set_n2w,
        nlp_model.set_expand_contractions,
        nlp_model.set_apply_stemming,
    ):
        enable(True)

    actual = nlp_options.apply_nlp_options(nlp_model, "666 don't in the raining")

    assert actual == 'six hundr and sixty-six do not in the rain '
def test_apply_nlp_options_stop_n2w_exp():
    """Number conversion, contraction expansion and stop-word removal combined (no stemming)."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    for enable in (
        nlp_model.set_n2w,
        nlp_model.set_expand_contractions,
        nlp_model.set_remove_stop_words,
    ):
        enable(True)

    actual = nlp_options.apply_nlp_options(nlp_model, "666 don't in the")

    assert actual == ' six hundred sixtysix'
Esempio n. 9
0
def test_create_unique_root_2():
    """Check the root name built from a fully populated configuration and NLP model."""
    from utilities.utilities import Utilities
    from model.configuration import Configuration
    from model.nlp import NLPModel

    helper = Utilities()
    config = Configuration()
    nlp = NLPModel()

    # Recognition settings.
    config.set_model('video')
    config.set_use_enhanced(False)
    config.set_language_code('fr_FR')
    config.set_alternative_language_codes(['en-US', 'ru-RU'])
    config.set_speech_context('hi', 5)

    # NLP settings: everything on except stemming.
    for setter, flag in (
        (nlp.set_remove_stop_words, True),
        (nlp.set_apply_stemming, False),
        (nlp.set_expand_contractions, True),
        (nlp.set_n2w, True),
    ):
        setter(flag)

    actual = helper.create_unique_root('12345', config, nlp)

    assert actual == '12345_video_fr_FR_alts_applied_speech_adaptation_applied_boost_5_stop_words_removed_contractions_expanded_numbers_converted_2_words'
def test_update_csv():
    """Check that ``IOHandler.update_csv`` appends a row describing the run.

    Writes the CSV header and one result row under ``test_results_path``,
    then verifies that the audio URI, the language code and the text
    ``True`` (stemming is switched on) appear in the file.
    """
    from utilities.io_handler import IOHandler
    from model.configuration import Configuration
    from model.nlp import NLPModel
    import os
    configuration = Configuration()
    nlp_model = NLPModel()
    io = IOHandler()
    result_file_name = io._result_file_name
    io.set_result_path('test_results_path')
    io.write_csv_header()
    expected_uri = 'gs://foo/bar/baz/test.flac'
    expected_lang = 'fr-FR'
    nlp_model.set_apply_stemming(True)
    configuration.set_language_code(expected_lang)
    io.update_csv(expected_uri, configuration, nlp_model)
    full_path = f'{io.get_result_path()}/{result_file_name}'

    try:
        with open(full_path, 'r') as file:
            contents = file.read()
    finally:
        # Delete the file only after it has been closed: removing a file that
        # is still open raises on Windows. Doing it in ``finally`` also cleans
        # up when reading fails.
        if os.path.exists(full_path):
            os.remove(full_path)

    assert expected_uri in contents
    assert expected_lang in contents
    assert 'True' in contents
def test_apply_nlp_options_stem_exp():
    """Stemming plus contraction expansion: "won't" is expanded and the words are stemmed."""
    from utilities.nlp_options import NLPOptions
    from model.nlp import NLPModel

    nlp_options = NLPOptions()
    nlp_model = NLPModel()
    for enable in (
        nlp_model.set_apply_stemming,
        nlp_model.set_expand_contractions,
    ):
        enable(True)

    actual = nlp_options.apply_nlp_options(nlp_model, "thingy dancing won't")

    assert actual == 'thingi danc will not '
class IOHandler(object):
    """File helper for the results CSV, HTML diagnostics, queue and hypothesis files.

    Paths are built as ``<result_path>/<file name>``. File errors in the
    ``write_*``/``update_csv`` methods are reported with ``print`` rather than
    raised; ``read_queue_file`` raises ``IOError`` when nothing could be read.
    """
    # Directory that receives results.csv, hypothesis and diagnostic files.
    _result_path = ''
    _result_file_name = 'results.csv'
    # Column header written once at the top of the results file.
    _csv_header = 'AUDIO_FILE, MODEL, ENHANCED, LANGUAGE, ALTERNATIVE_LANGS, PHRASE_HINTS_APPLIED, BOOST, REF_WORD_COUNT, REF_ERROR_COUNT , WER,STEMMING_APPLIED , STOP_WORDS_REMOVED, NUMBER_TO_WORD_CONVERSION, CONTRACTIONS_EXPANDED, INSERTIONS, DELETIONS, SUBSTITUTIONS, DELETED_WORDS, INSERTED_WORDS, SUBSTITUTE_WORDS\n'
    _csv_header_written = False
    # NOTE(review): these two class attributes are not used by any method in
    # this class; kept because external code may read them.
    configuration = Configuration()
    nlp_model = NLPModel()
    _queue_file_name = 'queue.txt'

    def set_queue_file_name(self, name):
        """Set the path of the queue file used by the queue read/write methods."""
        self._queue_file_name = name

    def get_queue_file_name(self):
        """Return the path of the queue file."""
        return self._queue_file_name

    def set_result_path(self, result_path):
        """Set the directory that result files are written to."""
        self._result_path = result_path

    def get_result_path(self):
        """Return the directory that result files are written to."""
        return self._result_path

    def write_csv_header(self):
        """Create the results file and write the CSV header, once per handler.

        The result directory is created when missing. Failures to open or
        write the file are printed, and the header is then retried on the
        next call instead of being marked as written.
        """
        import os
        if self._csv_header_written:
            return

        result_path = self.get_result_path()
        full_path = f'{result_path}/{self._result_file_name}'
        # if the path does not exist, make it
        if not os.path.exists(result_path):
            os.makedirs(result_path)

        # The ``open`` call is inside the ``try`` so that failures to open the
        # file are reported too. FileNotFoundError is a subclass of IOError
        # and therefore has to be handled first to be reachable.
        try:
            with open(full_path, 'w') as file:
                file.write(self._csv_header)
        except FileNotFoundError as x:
            print(f'Can not find csv file: {x}')
        except IOError as i:
            print(f'Can not write csv header: {i}')
        else:
            self._csv_header_written = True

    @staticmethod
    def _format_word_counts(counts):
        """Render a word->count mapping as ``'word:count, '`` items, ascending by count."""
        ordered = sorted(counts.items(), key=lambda pair: pair[1])
        return ''.join(f'{word}:{count}, ' for word, count in ordered)

    def update_csv(self,
                   uri,
                   configuration,
                   nlp_model,
                   word_count_list=None,
                   ref_total_word_count=0,
                   ref_error_count=0,
                   word_error_rate=0,
                   ins=0,
                   deletions=0,
                   subs=0):
        """Append one result row to the results CSV.

        Args:
            uri: Identifier of the audio file, written in the first column.
            configuration: Object providing ``get_model``, ``get_use_enhanced``,
                ``get_language_code``, ``get_alternative_language_codes``,
                ``get_phrases`` and ``get_boost``.
            nlp_model: Object providing ``get_apply_stemming``,
                ``get_remove_stop_words``, ``get_n2w`` and
                ``get_expand_contractions``.
            word_count_list: Optional sequence whose first three items are
                word->count mappings for deleted, inserted and substituted
                words (in that order).
            ref_total_word_count: Value for the REF_WORD_COUNT column.
            ref_error_count: Value for the REF_ERROR_COUNT column.
            word_error_rate: Value for the WER column.
            ins: Value for the INSERTIONS column.
            deletions: Value for the DELETIONS column.
            subs: Value for the SUBSTITUTIONS column.

        A failure to open or write the file is printed, not raised.
        """
        import logging
        logging.basicConfig(filename='wer_app.log')
        logger = logging.getLogger(__name__)

        deleted_words = ''
        inserted_words = ''
        substitute_words = ''
        if word_count_list:
            try:
                # Unpacking assigns all three names at once, so a TypeError in
                # any of them leaves every word column empty.
                deleted_words, inserted_words, substitute_words = (
                    self._format_word_counts(word_count_list[index])
                    for index in range(3))
            except TypeError as t:
                string = f'{t}'
                logger.debug(string)
                print(string)

        full_path = f'{self.get_result_path()}/{self._result_file_name}'
        # Tolerate a configuration without alternative languages (None).
        alts = ''.join(
            f'{item} '
            for item in (configuration.get_alternative_language_codes() or ()))
        # NOTE: the word columns contain ', ' separators themselves, so they
        # spill into extra CSV columns; the format is kept for compatibility.
        string = f'{uri}, {configuration.get_model()}, {configuration.get_use_enhanced()}, {configuration.get_language_code()},' \
                 f'{alts}, {bool(configuration.get_phrases())},' \
                 f'{configuration.get_boost()}, {ref_total_word_count}, {ref_error_count}, {word_error_rate}, {nlp_model.get_apply_stemming()},' \
                 f'{nlp_model.get_remove_stop_words()}, {nlp_model.get_n2w()}, {nlp_model.get_expand_contractions()}, {ins}, {deletions}, {subs}, ' \
                 f'{deleted_words}, {inserted_words}, {substitute_words}\n'
        try:
            with open(full_path, 'a+') as file:
                file.write(string)
        except IOError as i:
            print(f'Can not update csv file: {i}')
        else:
            # Only announce the update when the row was actually written.
            print(f'UPDATED: {full_path}')

    def write_html_diagnostic(self, wer_obj, unique_root, result_path):
        """Write ``wer_obj.aligned_htmls`` joined by ``<br>`` to ``<result_path>/<unique_root>.html``.

        A failure to open or write the file is printed, not raised.
        """
        aligned_html = '<br>'.join(wer_obj.aligned_htmls)

        result_file = unique_root + '.html'
        write_path = f'{result_path}/{result_file}'
        try:
            with open(write_path, 'w') as f:
                f.write(aligned_html)
        except IOError as i:
            print(f'Can not write html diagnostic {write_path}: {i}')
        else:
            print(f'WROTE: diagnostic file: {write_path} ')

    def write_queue_file(self, data):
        """Append items to the queue file, each followed by a comma.

        Args:
            data: Either a string, which is split on whitespace, or an
                iterable of strings.
        """
        try:
            with open(self._queue_file_name, 'a+') as f:
                info = data.split() if isinstance(data, str) else data
                f.writelines(item + ',' for item in info)
        except IOError as e:
            print(f'Can not write queue file: {e}')

    def read_queue_file(self):
        """Return the full contents of the queue file.

        Raises:
            IOError: If the file is missing, unreadable or empty.
        """
        result = None
        # FileNotFoundError is a subclass of IOError, so it is handled first.
        try:
            with open(self._queue_file_name, 'r') as f:
                result = f.read()
        except FileNotFoundError as x:
            print(f'Queue file not found: {x}')
        except IOError as e:
            print(f'Can not read queue file: {e}')
        if not result:
            raise IOError('No contents found in queue')
        return result

    def write_hyp(self, file_name, text):
        """Write ``text`` to ``<result_path>/<file_name>``, creating the directory if needed."""
        import os.path

        result_path = self.get_result_path()
        if not os.path.exists(result_path):
            os.makedirs(result_path)

        p = f'{result_path}/{file_name}'

        with open(p, 'w+') as f:
            f.write(text)
    # NOTE(review): fragment of a command-line entry point. The enclosing
    # function header, the creation of `parser` and some argument definitions
    # (e.g. those behind args.cloud_store_uri, args.local_results_path,
    # args.transcriptions_only, args.numbers_to_words) are outside this view.

    # Optional command-line flags.
    parser.add_argument('-alts', '--alternative_languages', default=None, nargs='+', required=False,
                        help="Space separated list of language codes for auto language detection. Example en-IN en-US en-GB")
    parser.add_argument('-p', '--phrase_file', required=False, type=str,
                        help='Path to file containing comma separated phrases')
    parser.add_argument('-b', '--boosts', default=list(), nargs='+', required=False,
                        help=('Space separated list of boost values to evaluate for speech adaptation'))
    parser.add_argument('-ch', '--multi', required=False, type=int,
                        help='Integer indicating the number of channels if more than one')
    parser.add_argument('-q', '--random_queue', required=False, action='store_true', help='Replaces default queue.txt with randomly named queue file')
    parser.add_argument('-fake', '--fake_hyp',  required=False, action='store_true', help='Use a fake hypothesis for testing')
    parser.add_argument('-limit', '--limit', required=False, default=None,type= int,  help = 'Limit to X number of audio files')
    parser.add_argument('-nzb', '--no_zeros_boost', required=False,  action='store_true', help='skip boost of 0' )
    parser.add_argument('-single', '--single_word', required=False, action='store_true', help='process each letter rather than whole words')
    parser.add_argument('-lf','--local_files_path', required=False, type=str, help='process local files',  default=None)

    # Objects configured from the parsed arguments below.
    nlp_model = NLPModel()
    io_handler = IOHandler()
    nlp_options = NLPOptions()
    configuration = Configuration()
    # Automatic punctuation is switched on; per the original author it has no
    # bearing on WER.
    configuration.set_enableAutomaticPunctuation(True)

    # Parse the command line and copy the values into locals and the model objects.
    args = parser.parse_args()
    no_zeros_for_boost = args.no_zeros_boost
    process_each_letter = args.single_word
    local_files_path = args.local_files_path
    limit = args.limit
    cloud_store_uri = args.cloud_store_uri
    io_handler.set_result_path(args.local_results_path)
    only_transcribe = args.transcriptions_only
    nlp_model.set_n2w(args.numbers_to_words)
    # NOTE(review): fragment of a command-line entry point. The enclosing
    # function header, the creation of `parser` and some argument definitions
    # (e.g. those behind args.cloud_store_uri, args.local_results_path,
    # args.transcriptions_only, args.numbers_to_words, args.stem,
    # args.remove_stop_words, args.expand) are outside this view.

    # Optional command-line flags.
    parser.add_argument('-l', '--langs', default=['en-US'], nargs='+', required=False,
                        help="Space separated list of language codes.  Each processed seperately.  Example en-AU en-GB")
    parser.add_argument('-alts', '--alternative_languages', default=None, nargs='+', required=False,
                        help="Space separated list of language codes for auto language detection. Example en-IN en-US en-GB")
    parser.add_argument('-p', '--phrase_file', required=False, type=str,
                        help='Path to file containing comma separated phrases')
    parser.add_argument('-b', '--boosts', default=list(), nargs='+', required=False,
                        help=('Space separated list of boost values to evaluate for speech adaptation'))
    parser.add_argument('-ch', '--multi', required=False, type=int,
                        help='Integer indicating the number of channels if more than one')
    parser.add_argument('-a', '--alts2prime', required=False, action='store_true', help='Use each alternative language as a primary language')
    parser.add_argument('-q', '--random_queue', required=False, action='store_true', help='Replaces default queue.txt with randomly named queue file')
    parser.add_argument('-fake', '--fake_hyp',  required=False, action='store_true', help='Use a fake hypothesis for testing')
    parser.add_argument('-limit', '--limit', required=False, default=None,type= int,  help = 'Limit to X number of audio files')

    # Objects configured from the parsed arguments below.
    nlp_model = NLPModel()
    io_handler = IOHandler()
    nlp_options = NLPOptions()
    configuration = Configuration()
    # Automatic punctuation is switched on; per the original author it has no
    # bearing on WER.
    configuration.set_enableAutomaticPunctuation(True)

    # Parse the command line and copy the values into locals and the model objects.
    args = parser.parse_args()
    limit = args.limit
    cloud_store_uri = args.cloud_store_uri
    io_handler.set_result_path(args.local_results_path)
    only_transcribe = args.transcriptions_only
    # Text-normalisation switches taken from the command line.
    nlp_model.set_n2w(args.numbers_to_words)
    nlp_model.set_apply_stemming(args.stem)
    nlp_model.set_remove_stop_words(args.remove_stop_words)
    nlp_model.set_expand_contractions(args.expand)
class NLPOptions(object):
    """Text normalisation steps that can be applied to a transcript hypothesis.

    The individual steps are contraction expansion, number-to-word
    conversion, stop-word removal and stemming; ``apply_nlp_options`` runs
    the ones enabled on an NLP model, in that order.
    """
    # NOTE(review): not used by any method in this class; kept because
    # external code may read it.
    nlp_model = NLPModel()
    # Mapping of contraction -> expansion, loaded in __init__.
    contractions = None

    def __init__(self):
        """Load the contraction dictionary from ``utilities.contractions``."""
        from utilities.contractions import contractions_dictionary
        self.contractions = contractions_dictionary

    def expand_contractions(self, text):
        """Return ``text`` with known contractions expanded and apostrophes removed.

        Matching is case-insensitive; the first character of each match is
        kept as typed so the original capitalisation survives.
        """
        import re
        contractions = self.contractions
        contractions_pattern = re.compile(
            '({})'.format('|'.join(contractions.keys())),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            # Try the match as typed first, then its lower-case form.
            expanded_contraction = (contractions.get(match)
                                    or contractions.get(match.lower()))
            expanded_contraction = first_char + expanded_contraction[1:]
            # Hack for this bug: https://github.com/dipanjanS/practical-machine-learning-with-python/issues/24
            if expanded_contraction == "as not":
                return "is not"
            return expanded_contraction

        expanded_text = contractions_pattern.sub(expand_match, text)
        # Drop any apostrophes that were not part of a known contraction.
        return expanded_text.replace("'", "")

    def convert_numbers_to_words(self, text):
        """Return ``text`` with all-digit tokens spelled out as words.

        The text is tokenised with NLTK and every token is emitted with a
        leading space. Does not handle things like 21st, 4th.
        """
        from nltk.tokenize import word_tokenize
        import inflect

        engine = inflect.engine()
        return ''.join(
            f' {engine.number_to_words(w)}' if w.isdigit() else f' {w}'
            for w in word_tokenize(text))

    def remove_stop_words(self, text):
        """Return ``text`` without punctuation and English stop words.

        The text is split on whitespace, punctuation is stripped from each
        word, and every kept word is emitted with a leading space.
        """
        from nltk.corpus import stopwords
        import string

        stop_words = set(stopwords.words('english'))
        table = str.maketrans('', '', string.punctuation)
        stripped = (w.translate(table) for w in text.split())
        return ''.join(f' {w}' for w in stripped if w not in stop_words)

    def apply_stemming(self, text):
        """Return the Porter stem of each token in ``text``, each followed by a space."""
        from nltk.stem import PorterStemmer
        from nltk.tokenize import word_tokenize

        ps = PorterStemmer()
        return ''.join(f'{ps.stem(w)} ' for w in word_tokenize(text))

    def apply_nlp_options(self, nlp_model, hyp):
        """Apply every option enabled on ``nlp_model`` to ``hyp``.

        The steps always run in the same order: contraction expansion,
        number-to-word conversion, stop-word removal, stemming. This is the
        order used by every explicit flag combination of the previous
        if/elif chain. That chain had no branch for "stop words + expand
        contractions" or for "no option enabled" and returned ``None`` for
        both; they are now handled like any other combination.

        Args:
            nlp_model: Object providing ``get_apply_stemming``,
                ``get_remove_stop_words``, ``get_n2w`` and
                ``get_expand_contractions``.
            hyp: The text to normalise.

        Returns:
            The normalised text, or ``hyp`` unchanged when no option is
            enabled.
        """
        pipeline = (
            (nlp_model.get_expand_contractions(), self.expand_contractions),
            (nlp_model.get_n2w(), self.convert_numbers_to_words),
            (nlp_model.get_remove_stop_words(), self.remove_stop_words),
            (nlp_model.get_apply_stemming(), self.apply_stemming),
        )
        result = hyp
        for enabled, step in pipeline:
            if enabled:
                result = step(result)
        return result