コード例 #1
0
ファイル: base.py プロジェクト: welgazil/phonemizer
    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language

        The text is first handed to `_phonemize_preprocess`, then
        phonemized by `_phonemize_aux` (`separator` and `strip` are
        forwarded to it unchanged), and finally passed through
        `_phonemize_postprocess`, whose result is returned.

        When `njobs` is 1 the whole text is phonemized in a single call,
        otherwise it is split with `chunks(text, njobs)` and the chunks
        are dispatched to `joblib.Parallel`.

        """
        text, text_type, punctuation_marks = self._phonemize_preprocess(text)

        if njobs == 1:
            # phonemize the text forced as a string
            text = self._phonemize_aux(list2str(text), separator, strip)
        else:
            self.logger.info('running %s on %s jobs', self.name(), njobs)

            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            log_storage = self.logger
            self.logger = None
            try:
                # we have here a list of phonemized chunks
                text = joblib.Parallel(n_jobs=njobs)(
                    joblib.delayed(self._phonemize_aux)(t, separator, strip)
                    for t in chunks(text, njobs))
            finally:
                # restore the log as it was before parallel processing,
                # even when a job raised: otherwise the instance would be
                # left with `logger = None` and the next logging call on
                # it would fail with an unrelated AttributeError
                self.logger = log_storage

            # flatten them in a single list
            text = list(itertools.chain.from_iterable(text))

        return self._phonemize_postprocess(text, text_type, punctuation_marks)
コード例 #2
0
def _phonemize(  # pylint: disable=too-many-arguments
        backend, text, separator, strip, njobs, prepend_text):
    """Auxiliary function to phonemize()

    Drops the blank lines of `text`, phonemizes the remaining lines with
    `backend.phonemize` and shapes the result:

    - if `prepend_text` is true, a list of (input line, phonemized line)
      pairs,
    - else a single string when `text` was given as a `str`,
    - else the phonemized lines as returned by the backend.

    Nothing is caught here: any exception raised by the backend
    propagates to the caller.

    """
    # the output format depends on the type of the input as received
    input_type = type(text)

    # work on a list of lines: trim the line separator characters from
    # both ends and skip the lines made of whitespace only
    lines = []
    for raw_line in str2list(text):
        line = raw_line.strip(os.linesep)
        if line.strip():
            lines.append(line)

    phonemized = backend.phonemize(
        lines, separator=separator, strip=strip, njobs=njobs)

    # `phonemized` holds one entry per retained input line
    if prepend_text:
        return list(zip(lines, phonemized))
    return list2str(phonemized) if input_type == str else phonemized
コード例 #3
0
ファイル: base.py プロジェクト: slbinilkumar/phonemizer
    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language

        Punctuation is taken out of the text before phonemization: it is
        either preserved and restored afterwards (when
        `self.preserve_punctuation` is true) or removed. `separator` and
        `strip` are forwarded unchanged to `_phonemize_aux`.

        When `njobs` is 1 the whole text is phonemized in a single call,
        otherwise it is split with `chunks(text, njobs)` and the chunks
        are dispatched to `joblib.Parallel`.

        The result is a string if `text` was given as a string, else a
        list of strings.

        """
        # remember the text type for output (either list or string)
        text_type = type(text)

        # deals with punctuation: remove it and keep track of it for
        # restoration at the end if asked for
        punctuation_marks = []
        if self.preserve_punctuation:
            text, punctuation_marks = self._punctuator.preserve(text)
        else:
            text = self._punctuator.remove(text)

        if njobs == 1:
            # phonemize the text forced as a string
            text = self._phonemize_aux(list2str(text), separator, strip)
        else:
            self.logger.info('running %s on %s jobs', self.name(), njobs)

            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            log_storage = self.logger
            self.logger = None
            try:
                # we have here a list of phonemized chunks
                text = joblib.Parallel(n_jobs=njobs)(
                    joblib.delayed(self._phonemize_aux)(t, separator, strip)
                    for t in chunks(text, njobs))
            finally:
                # restore the log as it was before parallel processing,
                # even when a job raised: otherwise the instance would be
                # left with `logger = None` and the next logging call on
                # it would fail with an unrelated AttributeError
                self.logger = log_storage

            # flatten them in a single list
            text = list(itertools.chain.from_iterable(text))

        # restore the punctuation if asked for
        if self.preserve_punctuation:
            text = self._punctuator.restore(text, punctuation_marks)

        # output the result formatted as a string or a list of strings
        # according to the type of the input text
        return (list2str(text)
                if text_type in six.string_types else str2list(text))
コード例 #4
0
    def _phonemize_postprocess(self, text, text_type, punctuation_marks):
        """Restores the punctuation and formats the phonemized `text`

        When `self.preserve_punctuation` is true, `punctuation_marks` are
        put back into `text` through `self._punctuator.restore`. The
        result is then returned as a string if `text_type` is one of
        `six.string_types`, and as a list of strings otherwise.

        """
        if self.preserve_punctuation:
            restored = self._punctuator.restore(text, punctuation_marks)
        else:
            restored = text

        # the output mirrors the type the text had when it was submitted
        if text_type in six.string_types:
            return list2str(restored)
        return str2list(restored)
コード例 #5
0
    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized, reporting language switches

        The text is first handed to `_phonemize_preprocess`, then
        phonemized by `_phonemize_aux` (`separator` and `strip` are
        forwarded to it unchanged), which returns both the phonemized
        text and the language switches it detected. The switches are
        passed to `_warn_on_lang_switch` and the phonemized text goes
        through `_phonemize_postprocess`, whose result is returned.

        When `njobs` is 1 the whole text is phonemized in a single call,
        otherwise it is split with `chunks(text, njobs)` and the chunks
        are dispatched to `joblib.Parallel`.

        """
        text, text_type, punctuation_marks = self._phonemize_preprocess(text)
        lang_switches = []

        if njobs == 1:
            # phonemize the text forced as a string
            text, lang_switches = self._phonemize_aux(list2str(text),
                                                      separator, strip)
        else:
            self.logger.info('running %s on %s jobs', self.name(), njobs)

            # divide the input text in chunks, each chunk being processed in a
            # separate job
            text_chunks = chunks(text, njobs)

            # offset used below to recover the line numbers in the input text
            # wrt the chunks
            offset = [0] + cumsum(
                (c.count('\n') + 1 for c in text_chunks[:-1]))

            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            log_storage = self.logger
            self.logger = None
            try:
                # we have here a list of (phonemized chunk, lang_switches)
                output = joblib.Parallel(n_jobs=njobs)(
                    joblib.delayed(self._phonemize_aux)(t, separator, strip)
                    for t in text_chunks)
            finally:
                # restore the log as it was before parallel processing,
                # even when a job raised: otherwise the instance would be
                # left with `logger = None` and the next logging call on
                # it would fail with an unrelated AttributeError
                self.logger = log_storage

            # flatten both the phonemized chunks and language switches in a
            # list. For language switches lines we need to add an offset to
            # have the correct lines numbers wrt the input text.
            text = list(itertools.chain.from_iterable(
                chunk[0] for chunk in output))
            lang_switches = [
                line + shift
                for chunk, shift in zip(output, offset)
                for line in chunk[1]]

        # warn the user if language switches occurred during phonemization
        self._warn_on_lang_switch(lang_switches)

        # finally restore the punctuation
        return self._phonemize_postprocess(text, text_type, punctuation_marks)
コード例 #6
0
def test_list2str():
    """Checks `list2str` on empty inputs, a single item and several items"""
    # (input, expected output) pairs
    expectations = (
        ('', ''),
        ([], ''),
        ([''], ''),
        (['abc'], 'abc'),
        (['a', 'b', 'c'], 'a\nb\nc'),
    )
    for given, expected in expectations:
        assert list2str(given) == expected
コード例 #7
0
ファイル: test_utils.py プロジェクト: resemble-ai/phonemizer
def test_list2str():
    """Checks `list2str` on empty inputs, a single item and several items"""
    # every flavour of empty input is expected to give an empty string
    for empty in ('', [], ['']):
        assert list2str(empty) == ''

    # a single item comes back unchanged
    assert list2str(['abc']) == 'abc'

    # several items are expected to be joined by the platform line separator
    items = ['a', 'b', 'c']
    assert list2str(items) == os.linesep.join(items)