Example #1
    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language"""
        text, text_type, punctuation_marks = self._phonemize_preprocess(text)

        if njobs == 1:
            # phonemize the text forced as a string
            text = self._phonemize_aux(list2str(text), separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)
            log_storage = self.logger
            self.logger = None

            # we have here a list of phonemized chunks
            text = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(t, separator, strip)
                for t in chunks(text, njobs))

            # flatten them in a single list
            text = list(itertools.chain(*text))

            # restore the log as it was before parallel processing
            self.logger = log_storage

        return self._phonemize_postprocess(text, text_type, punctuation_marks)
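
A minimal usage sketch for this method, assuming the usual phonemizer API; the EspeakBackend class, the 'en-us' language code and the input sentences are illustrative choices, not part of the example above:

from phonemizer.backend import EspeakBackend

# each string in the input list is treated as one utterance
backend = EspeakBackend('en-us')

# njobs=1 stays in the current process; njobs > 1 goes through the
# joblib branch shown above
phones = backend.phonemize(['hello world', 'how are you'], njobs=1)
print(phones)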
Example #2
    def phonemize(self, text, separator=default_separator,
                  strip=False, njobs=1):
        """Returns the `text` phonemized for the given language

        Parameters
        ----------
        text (list of str): The text to be phonemized. Each string in the list
          is considered as a separated line. Each line is considered as a text
          utterance. Any empty utterance will be ignored.

        separator (Separator): string separators between phonemes, syllables
          and words, default to separator.default_separator. Syllable separator
          is considered only for the festival backend. Word separator is
          ignored by the 'espeak-mbrola' backend.

        strip (bool): If True, don't output the last word and phone separators
          of a token, default to False.

        njobs (int): The number of parallel jobs to launch. The input text is
          split in `njobs` parts, phonemized on parallel instances of the
          backend and the outputs are finally collapsed.

        Returns
        -------
        phonemized text (list of str) : The input `text` phonemized for the
          given `language` and `backend`.

        Raises
        ------
        RuntimeError if something went wrong during the phonemization

        """
        if isinstance(text, str):
            # changed in phonemizer-3.0, warn the user
            raise RuntimeError(
                'input text to phonemize() is str but it must be list of str')

        text, punctuation_marks = self._phonemize_preprocess(text)

        if njobs == 1:
            # phonemize the whole text in a single process, with line offset 0
            phonemized = self._phonemize_aux(text, 0, separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)

            # we have here a list of phonemized chunks
            phonemized = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(
                    # chunk[0] is the text, chunk[1] is the offset
                    chunk[0], chunk[1], separator, strip)
                for chunk in zip(*chunks(text, njobs)))

            # flatten them in a single list
            phonemized = self._flatten(phonemized)

        return self._phonemize_postprocess(phonemized, punctuation_marks)
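
Since this version rejects plain strings and documents the separator handling, a short usage sketch may be useful; the Separator field names and the espeak backend are assumptions here, following the phonemizer.separator and phonemizer.backend modules:

from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator

backend = EspeakBackend('en-us')

# phones separated by '-', words by ' '; the syllable separator is only
# honoured by the festival backend
sep = Separator(phone='-', syllable='|', word=' ')

# a plain str raises RuntimeError here, so wrap even a single utterance
# in a list
phones = backend.phonemize(['hello world'], separator=sep, strip=True)
print(phones)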
Example #3
    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        text, text_type, punctuation_marks = self._phonemize_preprocess(text)
        lang_switches = []

        if njobs == 1:
            # phonemize the text forced as a string
            text, lang_switches = self._phonemize_aux(list2str(text),
                                                      separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)
            log_storage = self.logger
            self.logger = None

            # divide the input text in chunks, each chunk being processed in a
            # separate job
            text_chunks = chunks(text, njobs)

            # offset used below to recover the line numbers in the input text
            # wrt the chunks
            offset = [0] + cumsum(
                (c.count('\n') + 1 for c in text_chunks[:-1]))

            # we have here a list of (phonemized chunk, lang_switches)
            output = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(t, separator, strip)
                for t in text_chunks)

            # flatten both the phonemized chunks and language switches in a
            # list. For language switches lines we need to add an offset to
            # have the correct lines numbers wrt the input text.
            text = list(itertools.chain(*(chunk[0] for chunk in output)))
            lang_switches = [chunk[1] for chunk in output]
            for i in range(len(lang_switches)):
                for j in range(len(lang_switches[i])):
                    lang_switches[i][j] += offset[i]
            lang_switches = list(itertools.chain(*lang_switches))

            # restore the log as it was before parallel processing
            self.logger = log_storage

        # warn the user if language switches occurred during phonemization
        self._warn_on_lang_switch(lang_switches)

        # finally restore the punctuation
        return self._phonemize_postprocess(text, text_type, punctuation_marks)
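
The offset computation above relies on a cumsum helper; this is a minimal sketch of the behaviour the code needs (an assumption, the real helper presumably lives in the project's utils module):

def cumsum(values):
    """Running totals of `values`, e.g. cumsum([2, 3, 1]) -> [2, 5, 6]."""
    total, out = 0, []
    for value in values:
        total += value
        out.append(total)
    return out

# used above as offset = [0] + cumsum(c.count('\n') + 1 for c in text_chunks[:-1]),
# so offset[i] is the number of input lines preceding chunk i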
Example #4
    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language"""
        # remember the text type for output (either list or string)
        text_type = type(text)

        # deals with punctuation: remove it and keep track of it for
        # restoration at the end if asked for
        punctuation_marks = []
        if self.preserve_punctuation:
            text, punctuation_marks = self._punctuator.preserve(text)
        else:
            text = self._punctuator.remove(text)

        if njobs == 1:
            # phonemize the text forced as a string
            text = self._phonemize_aux(list2str(text), separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)
            log_storage = self.logger
            self.logger = None

            # we have here a list of phonemized chunks
            text = joblib.Parallel(n_jobs=njobs)(
                joblib.delayed(self._phonemize_aux)(t, separator, strip)
                for t in chunks(text, njobs))

            # flatten them in a single list
            text = list(itertools.chain(*text))

            # restore the log as it was before parallel processing
            self.logger = log_storage

        # restore the punctuation if asked for
        if self.preserve_punctuation:
            text = self._punctuator.restore(text, punctuation_marks)

        # output the result formatted as a string or a list of strings
        # according to the input type (text_type)
        return (list2str(text)
                if text_type in six.string_types else str2list(text))
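
Examples #1, #3 and #4 also depend on list2str and str2list helpers; a sketch of the behaviour the surrounding code expects (assumed, not copied from the project):

def str2list(s):
    """Return `s` as a list of lines; leave it untouched if already a list."""
    return s.strip().split('\n') if isinstance(s, str) else s


def list2str(s):
    """Return the list of lines `s` as a single newline-joined string."""
    return s if isinstance(s, str) else '\n'.join(s)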
Example #5
def test_chunks():
    for n in range(1, 5):
        c = chunks(['a'], n)
        assert c == ['a']

    assert chunks(['a', 'a'], 1) == ['a\na']
    assert chunks(['a', 'a'], 2) == ['a', 'a']
    assert chunks(['a', 'a'], 10) == ['a', 'a']

    assert chunks(['a', 'a', 'a'], 1) == ['a\na\na']
    assert chunks(['a', 'a', 'a'], 2) == ['a', 'a\na']
    assert chunks(['a', 'a', 'a'], 3) == ['a', 'a', 'a']
    assert chunks(['a', 'a', 'a'], 10) == ['a', 'a', 'a']

    assert chunks(['a', 'a', 'a', 'a'], 1) == ['a\na\na\na']
    assert chunks(['a', 'a', 'a', 'a'], 2) == ['a\na', 'a\na']
    assert chunks(['a', 'a', 'a', 'a'], 3) == ['a', 'a', 'a\na']
    assert chunks(['a', 'a', 'a', 'a'], 10) == ['a', 'a', 'a', 'a']
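
The assertions above fully pin down the expected behaviour: split a list of lines into at most n chunks, the last one absorbing the remainder, each chunk joined with '\n'. A sketch consistent with this test (an assumption, not necessarily the project's implementation):

def chunks(text, n):
    """Split the list of lines `text` into at most `n` newline-joined chunks.

    The first chunks have equal size, the last one absorbs the remainder.
    """
    size = int(max(1, len(text) / n))
    nchunks = min(n, len(text))
    return (['\n'.join(text[i * size:(i + 1) * size])
             for i in range(nchunks - 1)]
            + ['\n'.join(text[(nchunks - 1) * size:])])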
Example #6
def test_chunks():
    for i in range(1, 5):
        assert chunks(['a'], i) == ([['a']], [0])

    assert chunks(['a', 'a'], 1) == ([['a', 'a']], [0])
    assert chunks(['a', 'a'], 2) == ([['a'], ['a']], [0, 1])
    assert chunks(['a', 'a'], 10) == ([['a'], ['a']], [0, 1])

    assert chunks(['a', 'a', 'a'], 1) == ([['a', 'a', 'a']], [0])
    assert chunks(['a', 'a', 'a'], 2) == ([['a'], ['a', 'a']], [0, 1])
    assert chunks(['a', 'a', 'a'], 3) == ([['a'], ['a'], ['a']], [0, 1, 2])
    assert chunks(['a', 'a', 'a'], 10) == ([['a'], ['a'], ['a']], [0, 1, 2])

    assert chunks(['a', 'a', 'a', 'a'], 1) == ([['a', 'a', 'a', 'a']], [0])
    assert chunks(['a', 'a', 'a', 'a'], 2) == (
        [['a', 'a'], ['a', 'a']], [0, 2])
    assert chunks(['a', 'a', 'a', 'a'], 3) == (
        [['a'], ['a'], ['a', 'a']], [0, 1, 2])
    assert chunks(['a', 'a', 'a', 'a'], 10) == (
        [['a'], ['a'], ['a'], ['a']], [0, 1, 2, 3])
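
This later version of the test expects chunks to return the chunks as lists of utterances together with the offset of each chunk in the input. A sketch matching the assertions (again an assumption, not the project's actual code):

def chunks(text, n):
    """Split the list of utterances `text` into at most `n` chunks.

    Returns a pair (chunked, offsets) where offsets[i] is the index in
    `text` of the first utterance of chunked[i].
    """
    size = int(max(1, len(text) / n))
    nchunks = min(n, len(text))
    chunked = [text[i * size:(i + 1) * size] for i in range(nchunks - 1)]
    chunked.append(text[(nchunks - 1) * size:])

    offsets = [0]
    for chunk in chunked[:-1]:
        offsets.append(offsets[-1] + len(chunk))
    return chunked, offsets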