def _phonemize(  # pylint: disable=too-many-arguments
        backend, text, separator, strip, njobs, prepend_text):
    """Auxiliary function to phonemize()

    Normalizes `text` to a list of non-empty lines, hands it to
    ``backend.phonemize`` and shapes the result:

    * a list of ``(input line, phonemized line)`` pairs when
      `prepend_text` is true,
    * a single string when `text` was given as a ``str``,
    * the list returned by the backend otherwise.

    Raises a RuntimeError on error.

    """
    # the output mirrors the input container, so record its type before
    # the text is forced to a list
    input_type = type(text)

    # drop the line separators surrounding each line and skip the lines
    # that contain nothing but whitespace
    lines = []
    for raw_line in str2list(text):
        line = raw_line.strip(os.linesep)
        if line.strip():
            lines.append(line)

    # from here the phonemized text is a list of str
    phonemized = backend.phonemize(
        lines, separator=separator, strip=strip, njobs=njobs)

    # format the output as requested by the parameters
    if prepend_text:
        return list(zip(lines, phonemized))
    return list2str(phonemized) if input_type == str else phonemized
def _phonemize_postprocess(self, text, text_type, punctuation_marks):
    """Finalize a phonemized `text` before it is returned to the caller.

    When ``self.preserve_punctuation`` is set, `punctuation_marks` are put
    back into `text` through ``self._punctuator.restore``. The result is
    then returned as a string if `text_type` is one of
    ``six.string_types`` and as a list of strings otherwise.

    """
    if self.preserve_punctuation:
        # the marks were set aside before phonemization, reinsert them
        text = self._punctuator.restore(text, punctuation_marks)

    # the output container follows the type of the original input
    if text_type in six.string_types:
        return list2str(text)
    return str2list(text)
def restore(cls, text, marks):
    """Put the punctuation marks back into a text.

    This is the reverse operation of Punctuation.preserve(). From a list
    of text chunks and a list of punctuation marks, it returns the
    punctuated text as a list:

        ['hello', 'my world'], [',', '!'] -> ['hello, my world!']

    """
    # force the text as a list, the actual work is delegated to
    # _restore_aux
    lines = str2list(text)
    return cls._restore_aux(lines, marks, 0)
def test_str2list():
    """Check str2list() on plain strings and on '\\n' separated lines."""
    # (input string, expected list) pairs
    cases = (
        ('', ['']),
        ('a', ['a']),
        ('ab', ['ab']),
        ('a b', ['a b']),
        ('a\nb', ['a', 'b']),
        ('a\n\nb\n', ['a', '', 'b']),
    )

    for given, wanted in cases:
        assert str2list(given) == wanted
def test_str2list():
    """Check str2list() on plain strings and on os.linesep separated lines."""
    eol = os.linesep

    # (input string, expected list) pairs
    cases = (
        ('', ['']),
        ('a', ['a']),
        ('ab', ['ab']),
        ('a b', ['a b']),
        (f'a{eol}b', ['a', 'b']),
        (f'a{eol}{eol}b{eol}', ['a', '', 'b']),
    )

    for given, wanted in cases:
        assert str2list(given) == wanted
def phonemize(self, text, separator=default_separator,
              strip=False, njobs=1):
    """Returns the `text` phonemized for the given language

    `text` is either a string or a list of strings; the output has the
    same form (a string when `text` is one of ``six.string_types``, a
    list of strings otherwise).

    `separator` and `strip` are forwarded unchanged to
    ``self._phonemize_aux``.

    `njobs` is the number of parallel jobs. With ``njobs == 1`` the whole
    text is phonemized in a single in-process call, otherwise the text is
    split with ``chunks(text, njobs)`` and dispatched through joblib.

    Any exception raised during phonemization propagates to the caller.
    In the parallel case ``self.logger`` is put back in place before the
    exception leaves this method.

    """
    # remember the text type for output (either list or string)
    text_type = type(text)

    # deals with punctuation: remove it and keep track of it for
    # restoration at the end if asked for
    punctuation_marks = []
    if self.preserve_punctuation:
        text, punctuation_marks = self._punctuator.preserve(text)
    else:
        text = self._punctuator.remove(text)

    if njobs == 1:
        # phonemize the text forced as a string
        text = self._phonemize_aux(list2str(text), separator, strip)
    else:
        # If using parallel jobs, disable the log as stderr is not
        # picklable.
        self.logger.info('running %s on %s jobs', self.name(), njobs)
        log_storage = self.logger
        self.logger = None

        try:
            # we have here a list of phonemized chunks
            text = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(t, separator, strip)
                for t in chunks(text, njobs))
        finally:
            # restore the log as it was before parallel processing, even
            # if a job failed: otherwise the backend would be left with
            # no logger for any later use
            self.logger = log_storage

        # flatten them in a single list
        text = list(itertools.chain.from_iterable(text))

    # restore the punctuation if asked for
    if self.preserve_punctuation:
        text = self._punctuator.restore(text, punctuation_marks)

    # output the result formatted as a string or a list of strings
    # according to type(text)
    return (list2str(text) if text_type in six.string_types
            else str2list(text))
def preserve(self, text):
    """Removes punctuation from `text`, allowing for further restoration

    The text is processed line by line with ``self._preserve_line``. This
    method returns the text as a list of non-empty chunks, along with the
    list of punctuation marks needed for further restoration:

        'hello, my world!' -> ['hello', 'my world'], [',', '!']

    """
    chunks_found = []
    marks_found = []

    # each line is handled along with its index in the text
    for index, line in enumerate(str2list(text)):
        line_chunks, line_marks = self._preserve_line(line, index)
        chunks_found.extend(line_chunks)
        marks_found.extend(line_marks)

    # empty chunks are dropped from the returned text
    return [chunk for chunk in chunks_found if chunk], marks_found