Esempio n. 1
0
def test_preserve_2(text, expected):
    marks = ".!;:,?"
    punct = Punctuation(marks=marks)
    assert text == punct.restore(*punct.preserve(text))

    output = phonemize(text,
                       backend="espeak",
                       preserve_punctuation=True,
                       punctuation_marks=marks)
    assert output == expected
Esempio n. 2
0
def test_preserve_2(text, output):
    marks = ".!;:,?"
    p = Punctuation(marks=marks)
    t, m = p.preserve(text)
    assert text == p.restore(t, m)

    o = phonemize(
        text, backend="espeak",
        preserve_punctuation=True, punctuation_marks=marks)
    assert o == output
Esempio n. 3
0
    def __init__(self, language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        self.logger = logger
        self.logger.info(
            'initializing backend %s-%s', self.name(), self.version())

        # load the grapheme to phoneme mapping
        profile = self._load_g2p_profile(language)
        self.tokenizer = segments.Tokenizer(profile=profile)

        # setup punctuation processing
        self.preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)
Esempio n. 4
0
def test_issue55(backend, marks, text, expected):
    if marks == 'default':
        marks = Punctuation.default_marks()
    language = 'cree' if backend == 'segments' else 'en-us'

    try:
        with pytest.raises(expected):
            phonemize(text,
                      language=language,
                      backend=backend,
                      preserve_punctuation=True,
                      punctuation_marks=marks)
    except TypeError:
        try:
            assert expected == phonemize(text,
                                         language=language,
                                         backend=backend,
                                         preserve_punctuation=True,
                                         punctuation_marks=marks)
        except RuntimeError:
            if backend == 'festival':
                # TODO on some installations festival fails to phonemize "?".
                # It ends with a segmentation fault. This seems to only appear
                # with festival-2.5 (but is working on travis and docker image)
                pass
Esempio n. 5
0
    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 language_switch='keep-flags',
                 with_stress=False,
                 logger=get_logger()):
        super().__init__(language,
                         punctuation_marks=punctuation_marks,
                         preserve_punctuation=preserve_punctuation,
                         logger=logger)
        self.logger.debug('espeak is %s', self.espeak_path())

        # adapt some command line option to the espeak version (for
        # phoneme separation and IPA output)
        version = self.version()

        self.sep = '--sep=_'
        if version == '1.48.03' or version.split('.')[1] <= '47':
            self.sep = ''  # pragma: nocover

        self.ipa = '--ipa=3'
        if self.is_espeak_ng():  # this is espeak-ng
            self.ipa = '-x --ipa'

        # ensure the lang_switch argument is valid
        valid_lang_switch = ['keep-flags', 'remove-flags', 'remove-utterance']
        if language_switch not in valid_lang_switch:
            raise RuntimeError(
                'lang_switch argument "{}" invalid, must be in {}'.format(
                    language_switch, ", ".join(valid_lang_switch)))
        self._lang_switch = language_switch
        self._lang_switch_list = []

        self._with_stress = with_stress
Esempio n. 6
0
    def __init__(self, language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        super(self.__class__, self).__init__(
            language, punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation, logger=logger)

        self.script = get_package_resource('festival/phonemize.scm')
        self.logger.info('loaded {}'.format(self.script))
Esempio n. 7
0
    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        # ensure the backend is installed on the system
        if not self.is_available():
            raise RuntimeError(  # pragma: nocover
                '{} not installed on your system'.format(self.name()))

        self._logger = logger
        self._logger.info('initializing backend %s-%s', self.name(),
                          '.'.join(str(v) for v in self.version()))

        # ensure the backend support the requested language
        self._language = self._init_language(language)

        # setup punctuation processing
        self._preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)
Esempio n. 8
0
 def __init__(self,
              language,
              punctuation_marks=Punctuation.default_marks(),
              preserve_punctuation=False,
              logger=get_logger()):
     # will be initialized in _init_language() from super().__init__()
     self._tokenizer = None
     super().__init__(language,
                      punctuation_marks=punctuation_marks,
                      preserve_punctuation=preserve_punctuation,
                      logger=logger)
Esempio n. 9
0
def test_custom():
    punct = Punctuation()
    assert set(punct.marks) == set(punct.default_marks())
    assert punct.remove('a,b.c') == 'a b c'

    with pytest.raises(ValueError):
        punct.marks = ['?', '.']
    punct.marks = '?.'
    assert len(punct.marks) == 2
    assert punct.remove('a,b.c') == 'a,b c'
Esempio n. 10
0
    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        self._espeak = EspeakWrapper()
        logger.debug('loaded %s', self._espeak.library_path)

        super().__init__(language,
                         punctuation_marks=punctuation_marks,
                         preserve_punctuation=preserve_punctuation,
                         logger=logger)
Esempio n. 11
0
    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        super().__init__(language,
                         punctuation_marks=punctuation_marks,
                         preserve_punctuation=preserve_punctuation,
                         logger=logger)

        self.logger.debug('festival executable is %s', self.executable())

        # the Scheme script to be send to festival
        script_file = get_package_resource('festival/phonemize.scm')
        with open(script_file, 'r') as fscript:
            self._script = fscript.read()
        self.logger.debug('loaded %s', script_file)
Esempio n. 12
0
    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 with_stress=False,
                 tie=False,
                 language_switch='keep-flags',
                 words_mismatch='ignore',
                 logger=get_logger()):
        super().__init__(language,
                         punctuation_marks=punctuation_marks,
                         preserve_punctuation=preserve_punctuation,
                         logger=logger)

        self._espeak.set_voice(language)
        self._with_stress = with_stress
        self._tie = self._init_tie(tie)
        self._lang_switch = get_language_switch_processor(
            language_switch, self.logger, self.language)
Esempio n. 13
0
class BaseBackend(object):
    """Abstract base class of all the phonemization backends

    Provides a common interface to all backends. The central method is
    `phonemize()`

    """
    __metaclass__ = abc.ABCMeta

    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        # ensure the backend is installed on the system
        if not self.is_available():
            raise RuntimeError(  # pragma: nocover
                '{} not installed on your system'.format(self.name()))

        self.logger = logger
        self.logger.info('initializing backend %s-%s', self.name(),
                         self.version())

        # ensure the backend support the requested language
        if not self.is_supported_language(language):
            raise RuntimeError(
                'language "{}" is not supported by the {} backend'.format(
                    language, self.name()))
        self.language = language

        # setup punctuation processing
        self.preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""
        pass

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""
        pass

    @staticmethod
    @abc.abstractmethod
    def version():
        """Return the backend version as a string 'major.minor.patch'"""
        pass

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""
        pass

    @classmethod
    @abc.abstractmethod
    def is_supported_language(cls, language):
        """Returns True if `language` is supported by the backend"""
        return language in cls.supported_languages()

    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language"""
        # remember the text type for output (either list or string)
        text_type = type(text)

        # deals with punctuation: remove it and keep track of it for
        # restoration at the end if asked for
        punctuation_marks = []
        if self.preserve_punctuation:
            text, punctuation_marks = self._punctuator.preserve(text)
        else:
            text = self._punctuator.remove(text)

        if njobs == 1:
            # phonemize the text forced as a string
            text = self._phonemize_aux(list2str(text), separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)
            log_storage = self.logger
            self.logger = None

            # we have here a list of phonemized chunks
            text = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(t, separator, strip)
                for t in chunks(text, njobs))

            # flatten them in a single list
            text = list(itertools.chain(*text))

            # restore the log as it was before parallel processing
            self.logger = log_storage

        # restore the punctuation is asked for
        if self.preserve_punctuation:
            text = self._punctuator.restore(text, punctuation_marks)

        # output the result formatted as a string or a list of strings
        # according to type(text)
        return (list2str(text)
                if text_type in six.string_types else str2list(text))

    @abc.abstractmethod
    def _phonemize_aux(self, text, separator, strip):
        pass
Esempio n. 14
0
class BaseBackend(abc.ABC):
    """Abstract base class of all the phonemization backends

    Provides a common interface to all backends. The central method is
    `phonemize()`

    Parameters
    ----------
    language (str): The language code of the input text, must be supported by
      the backend. If `backend` is 'segments', the language can be a file with
      a grapheme to phoneme mapping.

    preserve_punctuation (bool): When True, will keep the punctuation in the
      phonemized output. Not supported by the 'espeak-mbrola' backend. Default
      to False and remove all the punctuation.

    punctuation_marks (str): The punctuation marks to consider when dealing
      with punctuation, either for removal or preservation. Default to
      Punctuation.default_marks().

    logger (logging.Logger): the logging instance where to send
      messages. If not specified, use the default system logger.

    Raises
    ------
    RuntimeError if the backend is not available of if the `language` cannot be
    initialized.

    """
    def __init__(self,
                 language,
                 punctuation_marks=Punctuation.default_marks(),
                 preserve_punctuation=False,
                 logger=get_logger()):
        # ensure the backend is installed on the system
        if not self.is_available():
            raise RuntimeError(  # pragma: nocover
                '{} not installed on your system'.format(self.name()))

        self._logger = logger
        self._logger.info('initializing backend %s-%s', self.name(),
                          '.'.join(str(v) for v in self.version()))

        # ensure the backend support the requested language
        self._language = self._init_language(language)

        # setup punctuation processing
        self._preserve_punctuation = preserve_punctuation
        self._punctuator = Punctuation(punctuation_marks)

    @classmethod
    def _init_language(cls, language):
        """Language initialization

        This method may be overloaded in child classes (see Segments backend)

        """
        if not cls.is_supported_language(language):
            raise RuntimeError(
                f'language "{language}" is not supported by the '
                f'{cls.name()} backend')
        return language

    @property
    def logger(self):
        """A logging.Logger instance where to send messages"""
        return self._logger

    @property
    def language(self):
        """The language code configured to be used for phonemization"""
        return self._language

    @staticmethod
    @abc.abstractmethod
    def name():
        """The name of the backend"""

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version as a tuple (major, minor, patch)"""

    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""

    @classmethod
    def is_supported_language(cls, language):
        """Returns True if `language` is supported by the backend"""
        return language in cls.supported_languages()

    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language

        Parameters
        ----------
        text (list of str): The text to be phonemized. Each string in the list
          is considered as a separated line. Each line is considered as a text
          utterance. Any empty utterance will be ignored.

        separator (Separator): string separators between phonemes, syllables
          and words, default to separator.default_separator. Syllable separator
          is considered only for the festival backend. Word separator is
          ignored by the 'espeak-mbrola' backend.

        strip (bool): If True, don't output the last word and phone separators
          of a token, default to False.

        njobs (int): The number of parallel jobs to launch. The input text is
          split in `njobs` parts, phonemized on parallel instances of the
          backend and the outputs are finally collapsed.

        Returns
        -------
        phonemized text (list of str) : The input `text` phonemized for the
          given `language` and `backend`.

        Raises
        ------
        RuntimeError if something went wrong during the phonemization

        """
        if isinstance(text, str):
            # changed in phonemizer-3.0, warn the user
            self.logger.error(
                'input text to phonemize() is str but it must be list')

        text, punctuation_marks = self._phonemize_preprocess(text)

        if njobs == 1:
            # phonemize the text forced as a string
            phonemized = self._phonemize_aux(text, 0, separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)

            # we have here a list of phonemized chunks
            phonemized = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(
                    # chunk[0] is the text, chunk[1] is the offset
                    chunk[0],
                    chunk[1],
                    separator,
                    strip) for chunk in zip(*chunks(text, njobs)))

            # flatten them in a single list
            phonemized = self._flatten(phonemized)

        return self._phonemize_postprocess(phonemized, punctuation_marks)

    @staticmethod
    def _flatten(phonemized):
        """Flatten a list of lists into a single one

        From [[1, 2], [3], [4]] returns [1, 2, 3, 4]. This method is used to
        format the output as obtained using multiple jobs.

        """
        return list(itertools.chain(*phonemized))

    @abc.abstractmethod
    def _phonemize_aux(self, text, offset, separator, strip):
        """The "concrete" phonemization method

        Must be implemented in child classes. `separator` and `strip`
        parameters are as given to the phonemize() method. `text` is as
        returned by _phonemize_preprocess(). `offset` is line number of the
        first line in `text` with respect to the original text (this is only
        usefull with running on chunks in multiple jobs. When using a single
        jobs the offset is 0).

        """

    def _phonemize_preprocess(self, text):
        """Preprocess the text before phonemization

        Removes the punctuation (keep trace of punctuation marks for further
        restoration if required by the `preserve_punctuation` option).

        """
        if self._preserve_punctuation:
            # a tuple (text, punctuation marks)
            return self._punctuator.preserve(text)
        return self._punctuator.remove(text), []

    def _phonemize_postprocess(self, phonemized, punctuation_marks):
        """Postprocess the raw phonemized output

        Restores the punctuation as needed.

        """
        if self._preserve_punctuation:
            return self._punctuator.restore(phonemized, punctuation_marks)
        return phonemized
Esempio n. 15
0
def test_preserve(inp):
    punct = Punctuation()
    text, marks = punct.preserve(inp)
    assert inp == punct.restore(text, marks)
Esempio n. 16
0
def test_remove(inp, out):
    assert Punctuation().remove(inp) == out
Esempio n. 17
0
def phonemize(  # pylint: disable=too-many-arguments
    text,
    language='en-us',
    backend='espeak',
    separator=default_separator,
    strip=False,
    prepend_text=False,
    preserve_punctuation=False,
    punctuation_marks=Punctuation.default_marks(),
    with_stress=False,
    tie=False,
    language_switch='keep-flags',
    words_mismatch='ignore',
    njobs=1,
    logger=get_logger()):
    """Multilingual text to phonemes converter

    Return a phonemized version of an input `text`, given its `language` and a
    phonemization `backend`.

    Note
    ----
    To improve the processing speed it is better to minimize the calls to this
    function: provide the input text as a list and call phonemize() a single
    time is much more efficient than calling it on each element of the list.
    Indeed the initialization of the phonemization backend can be expensive,
    especially for espeak. In one exemple,

    Do this:

    >>> text = [line1, line2, ...]
    >>> phonemize(text, ...)

    Not this:

    >>> for line in text:
    >>>     phonemize(line, ...)

    Parameters
    ----------
    text (str or list of str): The text to be phonemized. Any empty line will
      be ignored. If `text` is an str, it can be multiline (lines being
      separated by \n). If `text` is a list, each element is considered as a
      separated line. Each line is considered as a text utterance.

    language (str): The language code of the input text, must be supported by
      the backend. If `backend` is 'segments', the language can be a file with
      a grapheme to phoneme mapping.

    backend (str, optional): The software backend to use for phonemization,
      must be 'festival' (US English only is supported, coded 'en-us'),
      'espeak', 'espeak-mbrola' or 'segments'.

    separator (Separator): string separators between phonemes, syllables and
      words, default to separator.default_separator. Syllable separator is
      considered only for the festival backend. Word separator is ignored by
      the 'espeak-mbrola' backend. Initialize it as follows:
        >>> from phonemizer.separator import Separator
        >>> separator = Separator(phone='-', word=' ')

    strip (bool, optional): If True, don't output the last word and phone
      separators of a token, default to False.

    prepend_text (bool, optional): When True, returns a pair (input utterance,
      phonemized utterance) for each line of the input text. When False,
      returns only the phonemized utterances. Default to False

    preserve_punctuation (bool, optional): When True, will keep the punctuation
      in the phonemized output. Not supported by the 'espeak-mbrola' backend.
      Default to False and remove all the punctuation.

    punctuation_marks (str, optional): The punctuation marks to consider when
      dealing with punctuation, either for removal or preservation. Default to
      Punctuation.default_marks().

    with_stress (bool, optional): This option is only valid for the 'espeak'
      backend. When True the stresses on phonemes are present (stresses
      characters are ˈ'ˌ). When False stresses are removed. Default to False.

    tie (bool or char, optional): This option is only valid for the 'espeak'
      backend with espeak>=1.49. It is incompatible with phone separator. When
      not False, use a tie character within multi-letter phoneme names. When
      True, the char 'U+361' is used (as in d͡ʒ), 'z' means ZWJ character,
      default to False.

    language_switch (str, optional): Espeak can output some words in another
      language (typically English) when phonemizing a text. This option setups
      the policy to use when such a language switch occurs. Three values are
      available: 'keep-flags' (the default), 'remove-flags' or
      'remove-utterance'. The 'keep-flags' policy keeps the language switching
      flags, for example "(en) or (jp)", in the output. The 'remove-flags'
      policy removes them and the 'remove-utterance' policy removes the whole
      line of text including a language switch. This option is only valid for
      the 'espeak' backend.

    words_mismatch (str, optional): Espeak can join two consecutive words or
      drop some words, yielding a word count mismatch between orthographic and
      phonemized text. This option setups the policy to use when such a words
      count mismatch occurs. Three values are available: 'ignore' (the default)
      which do nothing, 'warn' which issue a warning for each mismatched line,
      and 'remove' which remove the mismatched lines from the output.

    njobs (int): The number of parallel jobs to launch. The input text is split
      in `njobs` parts, phonemized on parallel instances of the backend and the
      outputs are finally collapsed.

    logger (logging.Logger): the logging instance where to send messages. If
      not specified, use the default system logger.

    Returns
    -------
    phonemized text (str or list of str) : The input `text` phonemized for the
      given `language` and `backend`. The returned value has the same type of
      the input text (either a list or a string), excepted if `prepend_input`
      is True where the output is forced as a list of pairs (input_text,
      phonemized text).

    Raises
    ------
    RuntimeError if the `backend` is not valid or is valid but not installed,
      if the `language` is not supported by the `backend`, if any incompatible
      options are used.

    """
    # ensure we are using a compatible Python version
    if sys.version_info < (3, 6):  # pragma: nocover
        logger.error(
            'Your are using python-%s which is unsupported by the phonemizer, '
            'please update to python>=3.6', ".".join(sys.version_info))

    # ensure the arguments are valid
    _check_arguments(backend, with_stress, tie, separator, language_switch,
                     words_mismatch)

    # preserve_punctuation and word separator not valid for espeak-mbrola
    if backend == 'espeak-mbrola' and preserve_punctuation:
        logger.warning('espeak-mbrola backend cannot preserve punctuation')
    if backend == 'espeak-mbrola' and separator.word:
        logger.warning('espeak-mbrola backend cannot preserve word separation')

    # initialize the phonemization backend
    if backend == 'espeak':
        phonemizer = BACKENDS[backend](
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            with_stress=with_stress,
            tie=tie,
            language_switch=language_switch,
            words_mismatch=words_mismatch,
            logger=logger)
    elif backend == 'espeak-mbrola':
        phonemizer = BACKENDS[backend](language, logger=logger)
    else:  # festival or segments
        phonemizer = BACKENDS[backend](
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            logger=logger)

    # do the phonemization
    return _phonemize(phonemizer, text, separator, strip, njobs, prepend_text)
Esempio n. 18
0
def phonemize(
        text,
        language='en-us',
        backend='festival',
        separator=default_separator,
        strip=False,
        preserve_punctuation=False,
        punctuation_marks=Punctuation.default_marks(),
        with_stress=False,
        language_switch='keep-flags',
        njobs=1,
        logger=get_logger()):
    """Multilingual text to phonemes converter

    Return a phonemized version of an input `text`, given its
    `language` and a phonemization `backend`.

    Parameters
    ----------
    text (str or list of str): The text to be phonemized. Any empty
       line will be ignored. If `text` is an str, it can be multiline
       (lines being separated by \n). If `text` is a list, each
       element is considered as a separated line. Each line is
       considered as a text utterance.

    language (str): The language code of the input text, must be
      supported by the backend. If `backend` is 'segments', the
      language can be a file with a grapheme to phoneme mapping.

    backend (str): The software backend to use for phonemization, must
      be 'festival' (US English only is supported, coded 'en-us'),
      'espeak', 'espeak-mbrola' or 'segments'.

    separator (Separator): string separators between phonemes, syllables and
      words, default to separator.default_separator. Syllable separator is
      considered only for the festival backend. Word separator is ignored by
      the 'espeak-mbrola' backend.

    strip (bool): If True, don't output the last word and phone
      separators of a token, default to False.

    preserve_punctuation (bool): When True, will keep the punctuation in the
        phonemized output. Not supportyed by the 'espeak-mbrola' backend.
        Default to False and remove all the punctuation.

    punctuation_marks (str): The punctuation marks to consider when dealing
        with punctuation. Default to Punctuation.default_marks().

    with_stress (bool): This option is only valid for the 'espeak' backend.
      When True the stresses on phonemes are present (stresses characters are
      ˈ'ˌ). When False stresses are removed. Default to False.

    language_switch (str): Espeak can output some words in another language
      (typically English) when phonemizing a text. This option setups the
      policy to use when such a language switch occurs. Three values are
      available: 'keep-flags' (the default), 'remove-flags' or
      'remove-utterance'. The 'keep-flags' policy keeps the language switching
      flags, for example (en) or (jp), in the output. The 'remove-flags' policy
      removes them and the 'remove-utterance' policy removes the whole line of
      text including a language switch. This option is only valid for the
      'espeak' backend.

    njobs (int): The number of parallel jobs to launch. The input text
      is split in `njobs` parts, phonemized on parallel instances of
      the backend and the outputs are finally collapsed.

    logger (logging.Logger): the logging instance where to send
      messages. If not specified, use the default system logger.

    Returns
    -------
    phonemized text (str or list of str) : The input `text` phonemized
      for the given `language` and `backend`. The returned value has
      the same type of the input text (either a list or a string).

    Raises
    ------
    RuntimeError if the `backend` is not valid or is valid but not installed,
      if the `language` is not supported by the `backend`, if with_stress` or
      `language_switch` are used but the backend is not 'espeak'.

    """
    # ensure the backend is either espeak, festival or segments
    if backend not in ('espeak', 'espeak-mbrola', 'festival', 'segments'):
        raise RuntimeError(
            '{} is not a supported backend, choose in {}.'
            .format(backend, ', '.join(
                ('espeak', 'espeak-mbrola', 'festival', 'segments'))))

    # with_stress option only valid for espeak
    if with_stress and backend != 'espeak':
        raise RuntimeError(
            'the "with_stress" option is available for espeak backend only, '
            'but you are using {} backend'.format(backend))

    # language_switch option only valid for espeak
    if (
            language_switch != 'keep-flags'
            and backend not in ('espeak', 'espeak-mbrola')
    ):
        raise RuntimeError(
            'the "language_switch" option is available for espeak backend '
            'only, but you are using {} backend'.format(backend))

    # preserve_punctuation and word separator not valid for espeak-mbrola
    if backend == 'espeak-mbrola' and preserve_punctuation:
        logger.warning('espeak-mbrola backend cannot preserve punctuation')
    if backend == 'espeak-mbrola' and separator.word:
        logger.warning('espeak-mbrola backend cannot preserve word separation')

    # python2 needs additional utf8 encoding
    if sys.version_info[0] == 2:  # pragma: nocover
        logger.warning(
            'Your are using python2 but unsupported by the phonemizer, '
            'please update to python>=3.6')

    # instanciate the requested backend for the given language (raises
    # a RuntimeError if the language is not supported).
    backends = {b.name(): b for b in (
        EspeakBackend, FestivalBackend, SegmentsBackend, EspeakMbrolaBackend)}

    if backend == 'espeak':
        phonemizer = backends[backend](
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            with_stress=with_stress,
            language_switch=language_switch,
            logger=logger)
    elif backend == 'espeak-mbrola':
        phonemizer = backends[backend](
            language,
            logger=logger)
    else:  # festival or segments
        phonemizer = backends[backend](
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            logger=logger)

    # phonemize the input text
    return phonemizer.phonemize(
        text, separator=separator, strip=strip, njobs=njobs)
Esempio n. 19
0
def test_preserve(inp):
    p = Punctuation()
    t, m = p.preserve(inp)
    assert inp == p.restore(t, m)