def phonemize(self, text, separator=default_separator, strip=False, njobs=1): """Returns the `text` phonemized for the given language""" text, text_type, punctuation_marks = self._phonemize_preprocess(text) if njobs == 1: # phonemize the text forced as a string text = self._phonemize_aux(list2str(text), separator, strip) else: # If using parallel jobs, disable the log as stderr is not # picklable. self.logger.info('running %s on %s jobs', self.name(), njobs) log_storage = self.logger self.logger = None # we have here a list of phonemized chunks text = joblib.Parallel(n_jobs=njobs)( joblib.delayed(self._phonemize_aux)(t, separator, strip) for t in chunks(text, njobs)) # flatten them in a single list text = list(itertools.chain(*text)) # restore the log as it was before parallel processing self.logger = log_storage return self._phonemize_postprocess(text, text_type, punctuation_marks)
def phonemize(self, text, separator=default_separator, strip=False, njobs=1): """Returns the `text` phonemized for the given language Parameters ---------- text (list of str): The text to be phonemized. Each string in the list is considered as a separated line. Each line is considered as a text utterance. Any empty utterance will be ignored. separator (Separator): string separators between phonemes, syllables and words, default to separator.default_separator. Syllable separator is considered only for the festival backend. Word separator is ignored by the 'espeak-mbrola' backend. strip (bool): If True, don't output the last word and phone separators of a token, default to False. njobs (int): The number of parallel jobs to launch. The input text is split in `njobs` parts, phonemized on parallel instances of the backend and the outputs are finally collapsed. Returns ------- phonemized text (list of str) : The input `text` phonemized for the given `language` and `backend`. Raises ------ RuntimeError if something went wrong during the phonemization """ if isinstance(text, str): # changed in phonemizer-3.0, warn the user raise RuntimeError( 'input text to phonemize() is str but it must be list of str') text, punctuation_marks = self._phonemize_preprocess(text) if njobs == 1: # phonemize the text forced as a string phonemized = self._phonemize_aux(text, 0, separator, strip) else: # If using parallel jobs, disable the log as stderr is not # picklable. self.logger.info('running %s on %s jobs', self.name(), njobs) # we have here a list of phonemized chunks phonemized = joblib.Parallel(n_jobs=njobs)( joblib.delayed(self._phonemize_aux)( # chunk[0] is the text, chunk[1] is the offset chunk[0], chunk[1], separator, strip) for chunk in zip(*chunks(text, njobs))) # flatten them in a single list phonemized = self._flatten(phonemized) return self._phonemize_postprocess(phonemized, punctuation_marks)
def phonemize(self, text, separator=default_separator, strip=False, njobs=1):
    """Returns the `text` phonemized for the given language"""
    text, text_type, punctuation_marks = self._phonemize_preprocess(text)
    lang_switches = []

    if njobs == 1:
        # phonemize the text forced as a string
        text, lang_switches = self._phonemize_aux(
            list2str(text), separator, strip)
    else:
        # If using parallel jobs, disable the log as stderr is not
        # picklable.
        self.logger.info('running %s on %s jobs', self.name(), njobs)
        log_storage = self.logger
        self.logger = None

        # divide the input text in chunks, each chunk being processed in a
        # separate job
        text_chunks = chunks(text, njobs)

        # offset used below to recover the line numbers in the input text
        # wrt the chunks
        offset = [0] + cumsum(
            c.count('\n') + 1 for c in text_chunks[:-1])

        # we have here a list of (phonemized chunk, lang_switches)
        output = joblib.Parallel(n_jobs=njobs)(
            joblib.delayed(self._phonemize_aux)(t, separator, strip)
            for t in text_chunks)

        # flatten both the phonemized chunks and language switches in a
        # list. For language switches lines we need to add an offset to
        # have the correct line numbers wrt the input text.
        text = list(itertools.chain(*(chunk[0] for chunk in output)))
        lang_switches = [chunk[1] for chunk in output]
        for i in range(len(lang_switches)):
            for j in range(len(lang_switches[i])):
                lang_switches[i][j] += offset[i]
        lang_switches = list(itertools.chain(*lang_switches))

        # restore the log as it was before parallel processing
        self.logger = log_storage

    # warn the user if language switches occurred during phonemization
    self._warn_on_lang_switch(lang_switches)

    # finally restore the punctuation
    return self._phonemize_postprocess(text, text_type, punctuation_marks)
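# The `cumsum` helper used above to compute the chunk offsets is not shown in
# this listing. A minimal sketch consistent with its use (an assumption, not
# necessarily the library's implementation):
def cumsum(iterable):
    """Returns the cumulative sums of `iterable` as a list

    For instance cumsum([1, 2, 3]) == [1, 3, 6].

    """
    total, result = 0, []
    for value in iterable:
        total += value
        result.append(total)
    return result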
def phonemize(self, text, separator=default_separator, strip=False, njobs=1): """Returns the `text` phonemized for the given language""" # remember the text type for output (either list or string) text_type = type(text) # deals with punctuation: remove it and keep track of it for # restoration at the end if asked for punctuation_marks = [] if self.preserve_punctuation: text, punctuation_marks = self._punctuator.preserve(text) else: text = self._punctuator.remove(text) if njobs == 1: # phonemize the text forced as a string text = self._phonemize_aux(list2str(text), separator, strip) else: # If using parallel jobs, disable the log as stderr is not # picklable. self.logger.info('running %s on %s jobs', self.name(), njobs) log_storage = self.logger self.logger = None # we have here a list of phonemized chunks text = joblib.Parallel(n_jobs=njobs)( joblib.delayed(self._phonemize_aux)(t, separator, strip) for t in chunks(text, njobs)) # flatten them in a single list text = list(itertools.chain(*text)) # restore the log as it was before parallel processing self.logger = log_storage # restore the punctuation is asked for if self.preserve_punctuation: text = self._punctuator.restore(text, punctuation_marks) # output the result formatted as a string or a list of strings # according to type(text) return (list2str(text) if text_type in six.string_types else str2list(text))
def test_chunks():
    for n in range(1, 5):
        assert chunks(['a'], n) == ['a']

    assert chunks(['a', 'a'], 1) == ['a\na']
    assert chunks(['a', 'a'], 2) == ['a', 'a']
    assert chunks(['a', 'a'], 10) == ['a', 'a']

    assert chunks(['a', 'a', 'a'], 1) == ['a\na\na']
    assert chunks(['a', 'a', 'a'], 2) == ['a', 'a\na']
    assert chunks(['a', 'a', 'a'], 3) == ['a', 'a', 'a']
    assert chunks(['a', 'a', 'a'], 10) == ['a', 'a', 'a']

    assert chunks(['a', 'a', 'a', 'a'], 1) == ['a\na\na\na']
    assert chunks(['a', 'a', 'a', 'a'], 2) == ['a\na', 'a\na']
    assert chunks(['a', 'a', 'a', 'a'], 3) == ['a', 'a', 'a\na']
    assert chunks(['a', 'a', 'a', 'a'], 10) == ['a', 'a', 'a', 'a']
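# A sketch of a `chunks` implementation satisfying the tests above: the first
# n-1 chunks have equal size, the last chunk collects the remainder, and each
# chunk is joined into a single multiline string. This is an assumption
# consistent with the tests, not necessarily the library's exact code:
def chunks(text, n):
    """Returns at most `n` equally sized chunks of `text`

    Only the n-1 first chunks are guaranteed to have equal size, the last
    one collects the remainder and can be longer. Each chunk is returned
    as a single newline-joined string.

    """
    size = max(1, len(text) // n)
    nchunks = min(n, len(text))
    chunked = [text[i * size:(i + 1) * size] for i in range(nchunks - 1)]
    chunked.append(text[(nchunks - 1) * size:])
    return ['\n'.join(chunk) for chunk in chunked]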
def test_chunks():
    for i in range(1, 5):
        assert chunks(['a'], i) == ([['a']], [0])

    assert chunks(['a', 'a'], 1) == ([['a', 'a']], [0])
    assert chunks(['a', 'a'], 2) == ([['a'], ['a']], [0, 1])
    assert chunks(['a', 'a'], 10) == ([['a'], ['a']], [0, 1])

    assert chunks(['a', 'a', 'a'], 1) == ([['a', 'a', 'a']], [0])
    assert chunks(['a', 'a', 'a'], 2) == ([['a'], ['a', 'a']], [0, 1])
    assert chunks(['a', 'a', 'a'], 3) == ([['a'], ['a'], ['a']], [0, 1, 2])
    assert chunks(['a', 'a', 'a'], 10) == ([['a'], ['a'], ['a']], [0, 1, 2])

    assert chunks(['a', 'a', 'a', 'a'], 1) == ([['a', 'a', 'a', 'a']], [0])
    assert chunks(['a', 'a', 'a', 'a'], 2) == (
        [['a', 'a'], ['a', 'a']], [0, 2])
    assert chunks(['a', 'a', 'a', 'a'], 3) == (
        [['a'], ['a'], ['a', 'a']], [0, 1, 2])
    assert chunks(['a', 'a', 'a', 'a'], 10) == (
        [['a'], ['a'], ['a'], ['a']], [0, 1, 2, 3])
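# A sketch of the newer `chunks` satisfying the tests above: same partition
# as before, but the chunks stay as lists of lines and the function also
# returns the offset of each chunk in the input, as consumed by the
# offset-aware phonemize() earlier in this listing (again an assumption
# consistent with the tests):
def chunks(text, num):
    """Returns at most `num` chunks of `text` with the offset of each chunk

    Only the num-1 first chunks are guaranteed to have equal size, the
    last one can be longer. The offsets give the index of the first line
    of each chunk in the input `text`.

    """
    size = max(1, len(text) // num)
    nchunks = min(num, len(text))
    offsets = [i * size for i in range(nchunks)]
    chunked = [text[offsets[i]:offsets[i] + size]
               for i in range(nchunks - 1)]
    chunked.append(text[offsets[-1]:])
    return chunked, offsets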