def test_preserve_2(text, expected):
    """Check punctuation handling with a custom set of marks.

    First verifies that `Punctuation.preserve()` followed by
    `Punctuation.restore()` gives back `text` unchanged, then that
    `phonemize()` with the espeak backend and `preserve_punctuation=True`
    yields `expected`.

    """
    marks = ".!;:,?"

    # preserve() then restore() must be the identity on the input text
    punctuator = Punctuation(marks=marks)
    preserved = punctuator.preserve(text)
    assert text == punctuator.restore(*preserved)

    # same marks, this time going through the whole phonemization
    phonemized = phonemize(
        text,
        backend="espeak",
        preserve_punctuation=True,
        punctuation_marks=marks)
    assert phonemized == expected
def test_preserve_2(text, output):
    """Check punctuation preservation with an explicit set of marks.

    The input `text` must survive a preserve/restore round trip through
    `Punctuation`, and `phonemize()` called on it with the espeak backend
    and `preserve_punctuation=True` must return `output`.

    """
    marks = ".!;:,?"
    espeak_options = {
        "backend": "espeak",
        "preserve_punctuation": True,
        "punctuation_marks": marks,
    }

    # round trip: what preserve() takes apart, restore() puts back together
    punctuation = Punctuation(marks=marks)
    stripped_text, found_marks = punctuation.preserve(text)
    assert text == punctuation.restore(stripped_text, found_marks)

    # full phonemization with the very same punctuation marks
    phonemized = phonemize(text, **espeak_options)
    assert phonemized == output
class BaseBackend(abc.ABC):
    """Abstract base class of all the phonemization backends

    Provides a common interface to all backends. The central method is
    `phonemize()`

    Parameters
    ----------
    language (str): The language code of the input text, must be supported by
      the backend. If `backend` is 'segments', the language can be a file with
      a grapheme to phoneme mapping.

    preserve_punctuation (bool): When True, will keep the punctuation in the
      phonemized output. Not supported by the 'espeak-mbrola' backend. Default
      to False and remove all the punctuation.

    punctuation_marks (str): The punctuation marks to consider when dealing
      with punctuation, either for removal or preservation. Default to
      Punctuation.default_marks().

    logger (logging.Logger): the logging instance where to send
      messages. If not specified, use the default system logger.

    Raises
    ------
    RuntimeError if the backend is not available or if the `language` cannot
      be initialized.

    """
    def __init__(self, language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        # ensure the backend is installed on the system
        if not self.is_available():
            raise RuntimeError(  # pragma: nocover
                f'{self.name()} not installed on your system')

        self._logger = logger
        self._logger.info(
            'initializing backend %s-%s',
            self.name(), '.'.join(str(v) for v in self.version()))

        # ensure the backend support the requested language
        self._language = self._init_language(language)

        # setup punctuation processing
        self._preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)

    @classmethod
    def _init_language(cls, language):
        """Language initialization

        Returns `language` unchanged when it is supported by the backend,
        raises a RuntimeError otherwise.

        This method may be overloaded in child classes (see Segments backend)

        """
        if not cls.is_supported_language(language):
            raise RuntimeError(
                f'language "{language}" is not supported by the '
                f'{cls.name()} backend')
        return language

    @property
    def logger(self):
        """A logging.Logger instance where to send messages"""
        return self._logger

    @property
    def language(self):
        """The language code configured to be used for phonemization"""
        return self._language

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version as a tuple (major, minor, patch)"""

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""

    @classmethod
    def is_supported_language(cls, language):
        """Returns True if `language` is supported by the backend"""
        return language in cls.supported_languages()

    def phonemize(self, text, separator=default_separator,
                  strip=False, njobs=1):
        """Returns the `text` phonemized for the given language

        Parameters
        ----------
        text (list of str): The text to be phonemized. Each string in the list
          is considered as a separated line. Each line is considered as a text
          utterance. Any empty utterance will be ignored.

        separator (Separator): string separators between phonemes, syllables
          and words, default to separator.default_separator. Syllable separator
          is considered only for the festival backend. Word separator is
          ignored by the 'espeak-mbrola' backend.

        strip (bool): If True, don't output the last word and phone separators
          of a token, default to False.

        njobs (int): The number of parallel jobs to launch. The input text is
          split in `njobs` parts, phonemized on parallel instances of the
          backend and the outputs are finally collapsed.

        Returns
        -------
        phonemized text (list of str) : The input `text` phonemized for the
          given `language` and `backend`.

        Raises
        ------
        RuntimeError if something went wrong during the phonemization

        """
        if isinstance(text, str):
            # changed in phonemizer-3.0, warn the user (the error is only
            # logged here, the processing goes on with the text as given)
            self.logger.error(
                'input text to phonemize() is str but it must be list')

        text, punctuation_marks = self._phonemize_preprocess(text)

        if njobs == 1:
            # single job: phonemize the whole text at once, starting at
            # offset 0
            phonemized = self._phonemize_aux(text, 0, separator, strip)
        else:
            self.logger.info('running %s on %s jobs', self.name(), njobs)

            # we have here a list of phonemized chunks
            phonemized = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(
                    # chunk[0] is the text, chunk[1] is the offset
                    chunk[0], chunk[1], separator, strip)
                for chunk in zip(*chunks(text, njobs)))

            # flatten them in a single list
            phonemized = self._flatten(phonemized)

        return self._phonemize_postprocess(phonemized, punctuation_marks)

    @staticmethod
    def _flatten(phonemized):
        """Flatten a list of lists into a single one

        From [[1, 2], [3], [4]] returns [1, 2, 3, 4]. This method is used to
        format the output as obtained using multiple jobs.

        """
        return list(itertools.chain.from_iterable(phonemized))

    @abc.abstractmethod
    def _phonemize_aux(self, text, offset, separator, strip):
        """The "concrete" phonemization method

        Must be implemented in child classes. `separator` and `strip`
        parameters are as given to the phonemize() method. `text` is as
        returned by _phonemize_preprocess(). `offset` is line number of the
        first line in `text` with respect to the original text (this is only
        useful when running on chunks in multiple jobs. When using a single
        job the offset is 0).

        """

    def _phonemize_preprocess(self, text):
        """Preprocess the text before phonemization

        Removes the punctuation (keep trace of punctuation marks for further
        restoration if required by the `preserve_punctuation` option).

        Returns a pair (text, punctuation marks). The marks are an empty list
        when the punctuation is not preserved.

        """
        if self._preserve_punctuation:
            # a tuple (text, punctuation marks)
            return self._punctuator.preserve(text)
        return self._punctuator.remove(text), []

    def _phonemize_postprocess(self, phonemized, punctuation_marks):
        """Postprocess the raw phonemized output

        Restores the punctuation as needed: when `preserve_punctuation` is
        False the `phonemized` input is returned as is.

        """
        if self._preserve_punctuation:
            return self._punctuator.restore(phonemized, punctuation_marks)
        return phonemized
def test_preserve(inp):
    """Check that preserve() followed by restore() gives back the input.

    Uses a `Punctuation` instance built with its default arguments.

    """
    punctuator = Punctuation()

    # split the input into the text and its punctuation marks...
    stripped, found_marks = punctuator.preserve(inp)

    # ... and make sure they can be merged back into the original
    restored = punctuator.restore(stripped, found_marks)
    assert inp == restored
class BaseBackend(object):
    """Abstract base class of all the phonemization backends

    Provides a common interface to all backends. The central method is
    `phonemize()`

    Parameters
    ----------
    language: the language code to phonemize, checked at init with
      `is_supported_language()`.

    punctuation_marks: the marks given to the `Punctuation` instance used
      to remove or preserve the punctuation. Default to
      Punctuation.default_marks().

    preserve_punctuation (bool): when True the punctuation is preserved
      and restored in the phonemized output, when False (default) it is
      removed.

    logger: the object where to send log messages, default to
      get_logger().

    Raises
    ------
    RuntimeError if the backend is not available or if the `language` is
      not supported by the backend.

    """
    # NOTE(review): `__metaclass__` is only honored by Python 2. Under
    # Python 3 it is a plain class attribute, so the abstract methods below
    # are not enforced -- confirm whether six.add_metaclass is wanted here.
    __metaclass__ = abc.ABCMeta

    def __init__(self, language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        # ensure the backend is installed on the system
        if not self.is_available():
            raise RuntimeError(  # pragma: nocover
                '{} not installed on your system'.format(self.name()))

        self.logger = logger
        self.logger.info(
            'initializing backend %s-%s', self.name(), self.version())

        # ensure the backend support the requested language
        if not self.is_supported_language(language):
            raise RuntimeError(
                'language "{}" is not supported by the {} backend'.format(
                    language, self.name()))
        self.language = language

        # setup punctuation processing
        self.preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""

    @staticmethod
    @abc.abstractmethod
    def version():
        """Return the backend version as a string 'major.minor.patch'"""

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""

    @classmethod
    @abc.abstractmethod
    def is_supported_language(cls, language):
        """Returns True if `language` is supported by the backend"""
        return language in cls.supported_languages()

    def phonemize(self, text, separator=default_separator,
                  strip=False, njobs=1):
        """Returns the `text` phonemized for the given language

        Parameters
        ----------
        text (str or list of str): the text to be phonemized.

        separator: forwarded as is to `_phonemize_aux()`, default to
          default_separator.

        strip: forwarded as is to `_phonemize_aux()`, default to False.

        njobs (int): when 1 the whole text is phonemized in a single call
          to `_phonemize_aux()`, otherwise the text is split with
          `chunks(text, njobs)` and the chunks are phonemized in `njobs`
          joblib parallel jobs.

        Returns
        -------
        The phonemized text, passed through list2str() when the input
        `text` is a string and through str2list() otherwise.

        """
        # remember the text type for output (either list or string). Uses
        # isinstance() so as to accept string subclasses, and because
        # six.string_types is (basestring,) on Python 2, which never
        # contains the exact type of a string.
        return_str = isinstance(text, six.string_types)

        # deals with punctuation: remove it and keep track of it for
        # restoration at the end if asked for
        punctuation_marks = []
        if self.preserve_punctuation:
            text, punctuation_marks = self._punctuator.preserve(text)
        else:
            text = self._punctuator.remove(text)

        if njobs == 1:
            # phonemize the text forced as a string
            text = self._phonemize_aux(list2str(text), separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)
            log_storage = self.logger
            self.logger = None

            try:
                # we have here a list of phonemized chunks
                text = joblib.Parallel(n_jobs=njobs)(
                    joblib.delayed(self._phonemize_aux)(t, separator, strip)
                    for t in chunks(text, njobs))
            finally:
                # restore the log as it was before parallel processing,
                # even if one of the jobs failed: the backend must not be
                # left with a None logger
                self.logger = log_storage

            # flatten them in a single list
            text = list(itertools.chain(*text))

        # restore the punctuation is asked for
        if self.preserve_punctuation:
            text = self._punctuator.restore(text, punctuation_marks)

        # output the result formatted as a string or a list of strings
        # according to the type of the input text
        return list2str(text) if return_str else str2list(text)

    @abc.abstractmethod
    def _phonemize_aux(self, text, separator, strip):
        """The concrete phonemization method, implemented in child classes

        `separator` and `strip` are as given to `phonemize()`. `text` is
        the input text without punctuation: the output of list2str() when
        running a single job, a chunk from chunks() otherwise.

        """
def test_preserve(inp):
    """Check the preserve/restore round trip of a default `Punctuation`.

    Restoring what `preserve()` extracted from `inp` must rebuild `inp`.

    """
    punctuation = Punctuation()
    text_only, marks = punctuation.preserve(inp)
    rebuilt = punctuation.restore(text_only, marks)

    # the round trip must not alter the input
    assert inp == rebuilt