def __init__(
        self,
        g2p_type: str,
        non_linguistic_symbols: Optional[Union[Path, str, Iterable[str]]] = None,
        space_symbol: str = "<space>",
        remove_non_linguistic_symbols: bool = False,
    ):
        assert check_argument_types()
        if g2p_type == "g2p_en":
            self.g2p = g2p_en.G2p()
        elif g2p_type == "pyopenjtalk":
            self.g2p = pyopenjtalk_g2p
        elif g2p_type == "pyopenjtalk_kana":
            self.g2p = pyopenjtalk_g2p_kana
        else:
            raise NotImplementedError(f"Not supported: g2p_type={g2p_type}")

        self.g2p_type = g2p_type
        self.space_symbol = space_symbol
        if non_linguistic_symbols is None:
            self.non_linguistic_symbols = set()
        elif isinstance(non_linguistic_symbols, (Path, str)):
            non_linguistic_symbols = Path(non_linguistic_symbols)
            with non_linguistic_symbols.open("r", encoding="utf-8") as f:
                self.non_linguistic_symbols = set(line.rstrip() for line in f)
        else:
            self.non_linguistic_symbols = set(non_linguistic_symbols)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
        # __call__ below reads this flag; default to keeping inter-word spaces.
        self.no_space = False

    def __call__(self, text) -> List[str]:
        if self.g2p is None:
            # g2p_en.G2p is not picklable, so it may need to be re-created
            # lazily after the tokenizer is copied to a worker process.
            self.g2p = g2p_en.G2p()

        phones = self.g2p(text)
        if self.no_space:
            # remove space, which represents a word separator
            phones = list(filter(lambda s: s != " ", phones))
        return phones
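For reference, a minimal usage sketch of the tokenizer above; `PhonemeTokenizer` is a hypothetical name, since the snippet does not show the enclosing class:

tokenizer = PhonemeTokenizer(g2p_type="g2p_en")  # hypothetical class name
phones = tokenizer("Hello world")
# g2p_en typically returns ARPAbet phones with stress digits and an
# inter-word space, e.g. ['HH', 'AH0', 'L', 'OW1', ' ', 'W', 'ER1', 'L', 'D']
print(phones)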
Example #3
    @classmethod
    def attempt_load_g2p_en(cls, model_dir=None):
        try:
            if model_dir:
                import nltk
                nltk.data.path.insert(
                    0, os.path.abspath(os.path.join(model_dir, 'g2p')))
            # g2p_en>=2.1.0
            import g2p_en
            cls.g2p_en = g2p_en.G2p()
            # Every non-special phone g2p_en can emit must map to X-SAMPA
            # (the trailing stress digit is stripped before the lookup).
            assert all(
                re.sub(r'[012]$', '', phone) in cls.CMU_to_XSAMPA_dict
                for phone in cls.g2p_en.phonemes if not phone.startswith('<'))
        except Exception:  # including ImportError
            cls.g2p_en = False  # Don't try anymore.
            _log.debug("failed to load g2p_en")
Example #4
    @classmethod
    def generate_pronunciations(cls, word):
        """Return CMU/ARPAbet phones for ``word``: a single phone list from
        g2p_en, or a list of phone lists from the online lextool fallback."""
        if g2p_en:
            try:
                if not cls.g2p_en:
                    cls.g2p_en = g2p_en.G2p()
                phones = cls.g2p_en(word)
                _log.debug("generated pronunciation with g2p_en for %r: %r" %
                           (word, phones))
                return phones
            except Exception:
                _log.exception(
                    "generate_pronunciations exception using g2p_en")

        # Fall back to CMU's online "lextool" pronunciation service.
        try:
            files = {'wordfile': ('wordfile', word)}
            req = requests.post(
                'http://www.speech.cs.cmu.edu/cgi-bin/tools/logios/lextool.pl',
                files=files)
            req.raise_for_status()
            # FIXME: handle network failures
            match = re.search(r'<!-- DICT (.*)  -->', req.text)
            if match:
                url = match.group(1)
                req = requests.get(url)
                req.raise_for_status()
                entries = req.text.strip().split('\n')
                pronunciations = []
                for entry in entries:
                    tokens = entry.strip().split()
                    # Entry heads look like 'SEMI-COLON' or 'SEMI-COLON(2)';
                    # escape the word so regex metacharacters cannot break the match.
                    assert re.match(re.escape(word) + r'(\(\d\))?', tokens[0], re.I)
                    phones = tokens[1:]
                    _log.debug(
                        "generated pronunciation with cloud-cmudict for %r: CMU phones are %r"
                        % (word, phones))
                    pronunciations.append(phones)
                return pronunciations
        except Exception:
            _log.exception(
                "generate_pronunciations exception accessing www.speech.cs.cmu.edu")

        raise KaldiError("cannot generate word pronunciation")
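A hedged usage sketch for `generate_pronunciations`; `Lexicon` is a hypothetical name for the enclosing class, and `KaldiError` is assumed to be importable from the same package:

try:
    prons = Lexicon.generate_pronunciations('semicolon')  # hypothetical class name
    # Note the two shapes: the g2p_en branch returns one phone list,
    # while the lextool branch returns a list of phone lists.
    print(prons)
except KaldiError:
    print('no pronunciation could be generated')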
Example #5
    @classmethod
    def phonemize(
        cls,
        text: str,
        lang: Optional[str],
        phonemizer: Optional[str] = None,
        preserve_punct: bool = False,
        to_simplified_zh: bool = False,
    ) -> str:
        if to_simplified_zh:
            import hanziconv

            text = hanziconv.HanziConv.toSimplified(text)

        if phonemizer == "g2p":
            import g2p_en

            g2p = g2p_en.G2p()
            if preserve_punct:
                return " ".join("|" if p == " " else p for p in g2p(text))
            else:
                res = [{",": "sp", ";": "sp"}.get(p, p) for p in g2p(text)]
                return " ".join(p for p in res if p.isalnum())
        if phonemizer == "g2pc":
            import g2pc

            g2p = g2pc.G2pC()
            return " ".join([w[3] for w in g2p(text)])
        elif phonemizer == "ipa":
            assert lang is not None
            import phonemizer
            from phonemizer.separator import Separator

            lang_map = {"en": "en-us", "fr": "fr-fr"}
            return phonemizer.phonemize(
                text,
                backend="espeak",
                language=lang_map.get(lang, lang),
                separator=Separator(word="| ", phone=" "),
            )
        else:
            return text
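A hedged usage sketch of `phonemize`; `TextProcessor` is a hypothetical name for the enclosing class:

# ARPAbet via g2p_en: ',' and ';' become 'sp', other punctuation is dropped,
# yielding roughly 'HH AH0 L OW1 sp W ER1 L D'
print(TextProcessor.phonemize("Hello, world.", lang=None, phonemizer="g2p"))

# IPA via the phonemizer package (requires an espeak backend to be installed)
print(TextProcessor.phonemize("Bonjour", lang="fr", phonemizer="ipa"))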
Example #6
    def __init__(
        self,
        punct=True,
        stresses=False,
        spaces=True,
        *,
        space=' ',
        silence=None,
        oov=Base.OOV,
    ):
        if HAVE_G2P:
            Phonemes._G2P = g2p_en.G2p()
        else:
            raise ImportError(
                "G2P could not be imported properly. Please attempt to import `g2p_en` "
                f"before using {self.__class__.__name__}.")

        labels = []
        self.space, labels = len(labels), labels + [space]  # Space
        if silence:
            self.silence, labels = len(labels), labels + [silence]  # Silence
        labels.extend(self.CONSONANTS)
        vowels = list(self.VOWELS)
        if stresses:
            vowels = [
                f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))
            ]
        labels.extend(vowels)
        labels.append("'")  # Apostrophe

        if punct:
            labels.extend(self.PUNCT)

        super().__init__(labels, oov=oov)

        self.punct = punct
        self.stresses = stresses
        self.spaces = spaces
Example #7
    def __init__(
        self,
        punct=True,
        stresses=False,
        spaces=True,
        chars=False,
        *,
        space=' ',
        silence=None,
        apostrophe=True,
        oov=Base.OOV,
        sep='|',  # So that two- and three-letter phone codes can be distinguished.
        add_blank_at="last_but_one",
        pad_with_space=False,
        improved_version_g2p=False,
        phoneme_dict_path=None,
    ):
        labels = []
        self.space, labels = len(labels), labels + [space]  # Space

        if silence is not None:
            self.silence, labels = len(labels), labels + [silence]  # Silence

        labels.extend(self.CONSONANTS)
        vowels = list(self.VOWELS)

        if stresses:
            vowels = [f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))]
        labels.extend(vowels)

        if chars:
            labels.extend(string.ascii_lowercase)

        if apostrophe:
            labels.append("'")  # Apostrophe

        if punct:
            labels.extend(self.PUNCT)

        super().__init__(labels, oov=oov, sep=sep, add_blank_at=add_blank_at)

        self.punct = punct
        self.stresses = stresses
        self.spaces = spaces
        self.pad_with_space = pad_with_space

        # g2p_en runs download_corpora() on import, and that call is not rank-zero guarded.
        # If torch.distributed is initialized, let global rank 0 download the corpora while
        # the other ranks wait at a barrier; otherwise fall back to a fixed one-minute sleep.
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            group = torch.distributed.group.WORLD
            if is_global_rank_zero():
                download_corpora()
            torch.distributed.barrier(group=group)
        elif is_global_rank_zero():
            logging.error(
                f"Torch distributed needs to be initialized before {self} is initialized. This class is prone to "
                "data access race conditions. Now downloading corpora from global rank 0. If other ranks pass this "
                "point before rank 0 finishes, errors might result."
            )
            download_corpora()
        else:
            logging.error(
                f"Torch distributed needs to be initialized before {self} is initialized. This class is prone to "
                "data access race conditions. This process is not rank 0 and will now sleep for 1 minute. If it "
                "wakes before rank 0 finishes downloading, errors might result."
            )
            time.sleep(60)

        import g2p_en  # noqa pylint: disable=import-outside-toplevel

        _g2p = g2p_en.G2p()
        _g2p.variables = None

        if improved_version_g2p:
            self.g2p = G2p(_g2p, phoneme_dict_path)
        else:
            self.g2p = _g2p
Example #8
import abc

import nltk

from nemo.collections.asr.parts import parsers

try:
    nltk.data.find('taggers/averaged_perceptron_tagger.zip')
except LookupError:
    nltk.download('averaged_perceptron_tagger', quiet=True)
try:
    nltk.data.find('corpora/cmudict.zip')
except LookupError:
    nltk.download('cmudict', quiet=True)

try:
    import g2p_en  # noqa

    _g2p = g2p_en.G2p()
    _g2p.variables = None

    HAVE_G2P = True
except (ImportError, FileNotFoundError, LookupError):  # package missing or NLTK data missing
    HAVE_G2P = False


class Base(abc.ABC):
    """Vocabulary for turning str text to list of int tokens."""

    # fmt: off
    PUNCT = (  # Derived from LJSpeech
        ',',
        '.',
        '!',
Example #9
    def __init__(
        self,
        punct=True,
        stresses=False,
        spaces=True,
        chars=False,
        *,
        space=' ',
        silence=None,
        apostrophe=True,
        oov=Base.OOV,
        sep='|',  # So that two- and three-letter phone codes can be distinguished.
        add_blank_at="last_but_one",
        pad_with_space=False,
        improved_version_g2p=False,
        phoneme_dict_path=None,
    ):
        labels = []
        self.space, labels = len(labels), labels + [space]  # Space

        if silence is not None:
            self.silence, labels = len(labels), labels + [silence]  # Silence

        labels.extend(self.CONSONANTS)
        vowels = list(self.VOWELS)

        if stresses:
            vowels = [
                f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))
            ]
        labels.extend(vowels)

        if chars:
            labels.extend(string.ascii_lowercase)

        if apostrophe:
            labels.append("'")  # Apostrophe

        if punct:
            labels.extend(self.PUNCT)

        super().__init__(labels, oov=oov, sep=sep, add_blank_at=add_blank_at)

        self.punct = punct
        self.stresses = stresses
        self.spaces = spaces
        self.pad_with_space = pad_with_space

        download_corpora()
        _ = sync_ddp_if_available(
            torch.tensor(0))  # Barrier until rank 0 downloads the corpora

        # g2p_en runs download_corpora() on import, and that call is not rank-zero guarded.
        import g2p_en  # noqa pylint: disable=import-outside-toplevel

        _g2p = g2p_en.G2p()
        _g2p.variables = None

        if improved_version_g2p:
            self.g2p = G2p(_g2p, phoneme_dict_path)
        else:
            self.g2p = _g2p
Example #10
    def __init__(self, delimit=' '):
        self._delimit = delimit
        self._g2pen = g2p_en.G2p()
        self._g2pvn = G2pVn(try_other=self._g2pen)
Example #11
class Phonemes(Base):
    """Phonemes vocabulary."""

    _G2P = g2p_en.G2p()

    SEP = '|'  # So that two- and three-letter phone codes can be distinguished.
    # fmt: off
    VOWELS = (
        'AA', 'AE', 'AH', 'AO', 'AW',
        'AY', 'EH', 'ER', 'EY', 'IH',
        'IY', 'OW', 'OY', 'UH', 'UW',
    )
    CONSONANTS = (
        'B', 'CH', 'D', 'DH', 'F', 'G',
        'HH', 'JH', 'K', 'L', 'M', 'N',
        'NG', 'P', 'R', 'S', 'SH', 'T',
        'TH', 'V', 'W', 'Y', 'Z', 'ZH',
    )
    # fmt: on

    def __init__(
        self, punct=True, stresses=False, spaces=True, *, space=' ', silence=None, oov=Base.OOV,
    ):
        labels = []
        self.space, labels = len(labels), labels + [space]  # Space
        if silence:
            self.silence, labels = len(labels), labels + [silence]  # Silence
        labels.extend(self.CONSONANTS)
        vowels = list(self.VOWELS)
        if stresses:
            vowels = [f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))]
        labels.extend(vowels)
        labels.append("'")  # Apostrophe

        if punct:
            labels.extend(self.PUNCT)

        super().__init__(labels, oov=oov)

        self.punct = punct
        self.stresses = stresses
        self.spaces = spaces

    def encode(self, text):
        """See base class."""
        ps, space = [], self.labels[self.space]

        for p in self._G2P(text):
            # Strip the trailing ARPAbet stress digit, e.g. 'AH0' -> 'AH'.
            if len(p) == 3 and not self.stresses:
                p = p[:2]

            if p == space and ps and ps[-1] != space:
                ps.append(p)

            if p.isalnum() or p == "'":
                ps.append(p)

            if p in self.PUNCT and self.punct:
                if not self.spaces and len(ps) and ps[-1] == space:
                    ps.pop()

                ps.append(p)

        if ps and ps[-1] == space:
            ps.pop()

        return [self._label2id[p] for p in ps]
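A worked sketch of `encode` with the default flags; the exact phones depend on g2p_en's model, so the values shown are indicative only:

vocab = Phonemes()  # punct=True, stresses=False, spaces=True
ids = vocab.encode("Hi there!")
# g2p_en typically yields ['HH', 'AY1', ' ', 'DH', 'EH1', 'R', '!'];
# with stresses=False the digits are stripped, so the labels looked up
# in _label2id are ['HH', 'AY', ' ', 'DH', 'EH', 'R', '!'].
print(ids)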