Esempio n. 1
0
def _phonemize(  # pylint: disable=too-many-arguments
        backend, text, separator, strip, njobs, prepend_text):
    """Phonemize `text` with `backend` and format the result.

    Helper behind phonemize(). The input is turned into a list of lines
    with str2list(), end-of-line characters are stripped from both ends of
    each line and lines containing only whitespace are dropped before the
    backend is called.

    The return value depends on the arguments:

    - if `prepend_text` is true, a list of (input line, phonemized line)
      pairs,
    - otherwise, if `text` was exactly a `str`, the phonemized lines
      passed through list2str(),
    - otherwise the list returned by the backend, unchanged.

    No exception is caught here: any error raised by the backend
    (documented upstream as a RuntimeError) propagates to the caller.

    """
    # Remember whether the caller gave a plain string. This is an exact
    # type comparison, so instances of str subclasses do not count.
    returns_string = type(text) == str

    # Normalise the input: one entry per non-blank line, without the
    # surrounding end-of-line characters.
    lines = []
    for raw_line in str2list(text):
        line = raw_line.strip(os.linesep)
        if line.strip():
            lines.append(line)

    phonemized = backend.phonemize(
        lines, separator=separator, strip=strip, njobs=njobs)

    # Format the backend output as requested by the parameters.
    if prepend_text:
        return list(zip(lines, phonemized))
    return list2str(phonemized) if returns_string else phonemized
Esempio n. 2
0
    def _phonemize_postprocess(self, text, text_type, punctuation_marks):
        """Finalize phonemized `text` before it is returned to the caller.

        When `self.preserve_punctuation` is set, the punctuation described
        by `punctuation_marks` is put back with the punctuator's restore().
        The result is then converted with list2str() if `text_type` is one
        of `six.string_types`, and with str2list() otherwise.

        """
        restored = text
        if self.preserve_punctuation:
            # put the punctuation back only if it was asked for
            restored = self._punctuator.restore(text, punctuation_marks)

        # the output format follows the type of the original input
        if text_type in six.string_types:
            return list2str(restored)
        return str2list(restored)
Esempio n. 3
0
    def restore(cls, text, marks):
        """Put punctuation marks back into a text.

        Inverse of Punctuation.preserve(): from a list of punctuation-free
        chunks and the list of punctuation marks that were removed, rebuild
        the punctuated text and return it as a list:

            ['hello', 'my world'], [',', '!'] -> ['hello, my world!']

        `text` is first converted with str2list(); the actual work is
        delegated to `cls._restore_aux`.

        """
        lines = str2list(text)
        # NOTE(review): the trailing 0 is presumably the starting position
        # for _restore_aux -- confirm against its definition.
        return cls._restore_aux(lines, marks, 0)
Esempio n. 4
0
def test_str2list():
    """Check str2list() on empty, single-line and multi-line strings."""
    # (input string, expected list) pairs
    cases = (
        ('', ['']),
        ('a', ['a']),
        ('ab', ['ab']),
        ('a b', ['a b']),
        ('a\nb', ['a', 'b']),
        ('a\n\nb\n', ['a', '', 'b']),
    )
    for given, expected in cases:
        assert str2list(given) == expected
Esempio n. 5
0
def test_str2list():
    """Check str2list() using the platform line separator."""
    eol = os.linesep

    # (input string, expected list) pairs; multi-line inputs are built by
    # joining the pieces with the platform end-of-line sequence
    cases = (
        ('', ['']),
        ('a', ['a']),
        ('ab', ['ab']),
        ('a b', ['a b']),
        (eol.join(['a', 'b']), ['a', 'b']),
        (eol.join(['a', '', 'b', '']), ['a', '', 'b']),
    )
    for given, expected in cases:
        assert str2list(given) == expected
Esempio n. 6
0
    def phonemize(self,
                  text,
                  separator=default_separator,
                  strip=False,
                  njobs=1):
        """Returns the `text` phonemized for the given language

        Punctuation is first taken out of `text` by the punctuator: it is
        remembered for later restoration when `self.preserve_punctuation`
        is set and simply removed otherwise. The text is then phonemized
        with `self._phonemize_aux`, either in a single call (`njobs` equal
        to 1) or chunk by chunk through joblib (any other value).

        The result is converted with list2str() when the type of the input
        `text` is one of `six.string_types`, and with str2list() otherwise.

        While the parallel jobs run, `self.logger` is temporarily set to
        None; it is always restored afterwards, even if a job raises.

        """
        # remember the text type for output (either list or string)
        text_type = type(text)

        # deals with punctuation: remove it and keep track of it for
        # restoration at the end if asked for
        punctuation_marks = []
        if self.preserve_punctuation:
            text, punctuation_marks = self._punctuator.preserve(text)
        else:
            text = self._punctuator.remove(text)

        if njobs == 1:
            # phonemize the text forced as a string
            text = self._phonemize_aux(list2str(text), separator, strip)
        else:
            # If using parallel jobs, disable the log as stderr is not
            # picklable.
            self.logger.info('running %s on %s jobs', self.name(), njobs)
            log_storage = self.logger
            self.logger = None
            try:
                # we have here a list of phonemized chunks
                text = joblib.Parallel(n_jobs=njobs)(
                    joblib.delayed(self._phonemize_aux)(t, separator, strip)
                    for t in chunks(text, njobs))
            finally:
                # restore the log as it was before parallel processing,
                # including when a job failed: otherwise the instance
                # would be left without a logger
                self.logger = log_storage

            # flatten them in a single list
            text = list(itertools.chain.from_iterable(text))

        # restore the punctuation if asked for
        if self.preserve_punctuation:
            text = self._punctuator.restore(text, punctuation_marks)

        # output the result formatted as a string or a list of strings
        # according to the type of the input text
        return (list2str(text)
                if text_type in six.string_types else str2list(text))
Esempio n. 7
0
    def preserve(self, text):
        """Strip punctuation from `text` while recording it for later

        The text is returned as a list of punctuation-free chunks, together
        with the list of punctuation marks needed for further restoration:

            'hello, my world!' -> ['hello', 'my world'], [',', '!']

        Each line of `text` (as split by str2list()) is processed by
        `self._preserve_line` along with its line number. Empty chunks are
        dropped from the returned text; the marks are returned as
        collected.

        """
        all_chunks = []
        all_marks = []

        for index, line in enumerate(str2list(text)):
            line_chunks, line_marks = self._preserve_line(line, index)
            all_chunks.extend(line_chunks)
            all_marks.extend(line_marks)

        non_empty_chunks = [chunk for chunk in all_chunks if chunk]
        return non_empty_chunks, all_marks