Beispiel #1
0
    def general_trans(self, text, filter_func,
                      normpunc=False, ligatures=False):
        """Transliterate a word into IPA, filtering with filter_func.

        The input is lowercased and NFD-normalized, passed through
        ``self.strip_diacritics`` and (when ``self.preproc`` is set)
        ``self.preprocessor``. It is then consumed left to right: wherever
        ``self.regexp`` matches at the head of the remaining text, the match
        is replaced by the first entry of ``self.g2p[match]``; otherwise a
        single character is passed through and counted in ``self.nils``.

        Args:
            text (str): word to transcribe; unicode strings
            filter_func (function): function for filtering segments; takes
                                    a <segment, is_ipa> tuple and returns a
                                    boolean.
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of
                              standard IPA

        Returns:
            unicode: NFC-normalized IPA string, filtered by filter_func.
        """
        text = unicode(text)
        text = unicodedata.normalize('NFD', text.lower())
        # Lazy %-style arguments: repr() is only computed when DEBUG is on.
        logging.debug('(after norm) text=%r', list(text))
        text = self.strip_diacritics.process(text)
        logging.debug('(after strip) text=%r', list(text))
        if self.preproc:
            text = self.preprocessor.process(text)
        logging.debug('(after preproc) text=%r', list(text))
        tr_list = []
        while text:
            logging.debug('text=%r', list(text))
            m = self.regexp.match(text)
            if m:
                source = m.group(0)
                try:
                    target = self.g2p[source][0]
                except KeyError:
                    # The key is absent, so self.g2p[source] must not be
                    # indexed again here: doing so would raise a second
                    # KeyError from inside the handler and defeat the
                    # pass-through fallback.
                    logging.debug("source = '%s'", source)
                    target = source
                except IndexError:
                    # The key exists but maps to an empty sequence.
                    logging.debug("self.g2p[source]=%s", self.g2p[source])
                    target = source
                tr_list.append((target, True))
                text = text[len(source):]
            else:
                # Unmapped character: pass it through, flagged as non-IPA.
                tr_list.append((text[0], False))
                # NOTE(review): the counter is bumped by 2 per unmapped
                # character -- confirm this weighting is intentional.
                self.nils[text[0]] += 2
                text = text[1:]
        text = ''.join([s for (s, _) in filter(filter_func, tr_list)])
        if self.postproc:
            text = self.postprocessor.process(text)
        if ligatures or self.ligatures:
            text = ligaturize(text)
        if normpunc:
            text = self.puncnorm.norm(text)
        return unicodedata.normalize('NFC', text)
Beispiel #2
0
    def general_trans(self, text, filter_func,
                      normpunc=False, ligatures=False):
        """Transliterate a word into IPA, filtering with filter_func.

        The input is passed through ``self.strip_diacritics``, lowercased
        and NFC-normalized, then (when ``self.preproc`` is set) run through
        ``self.preprocessor``. The result is consumed left to right:
        wherever ``self.regexp`` matches at the head of the remaining text,
        the match is replaced by the first entry of ``self.g2p[match]``;
        otherwise one character is passed through and counted in
        ``self.nils``.

        Args:
            text (str): word to transcribe; unicode strings
            filter_func (function): function for filtering segments; takes
                                    a <segment, is_ipa> tuple and returns a
                                    boolean.
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of
                              standard IPA

        Returns:
            unicode: IPA string, filtered by filter_func.
        """
        text = unicode(text)
        text = self.strip_diacritics.process(text)
        text = unicodedata.normalize('NFC', text.lower())
        if self.preproc:
            text = self.preprocessor.process(text)
        tr_list = []
        while text:
            m = self.regexp.match(text)
            if m:
                source = m.group(0)
                try:
                    target = self.g2p[source][0]
                except (KeyError, IndexError):
                    # Missing key or empty mapping: fall back to the matched
                    # text itself. self.g2p[source] is deliberately not
                    # indexed again here, since for a missing key that would
                    # raise a second KeyError from inside the handler.
                    logging.debug("source = '%s'", source)
                    target = source
                tr_list.append((target, True))
                text = text[len(source):]
            else:
                # Unmapped character: pass it through, flagged as non-IPA.
                tr_list.append((text[0], False))
                # NOTE(review): the counter is bumped by 2 per unmapped
                # character -- confirm this weighting is intentional.
                self.nils[text[0]] += 2
                text = text[1:]
        text = ''.join([s for (s, _) in filter(filter_func, tr_list)])
        if self.postproc:
            text = self.postprocessor.process(text)
        if ligatures or self.ligatures:
            text = ligaturize(text)
        if normpunc:
            text = self.puncnorm.norm(text)
        return text
Beispiel #3
0
    def transliterate(self, text, normpunc=False, ligatures=False, safe=True):
        """Transliterates/transcribes a word into IPA

        Passes unmapped characters through to output unchanged and counts
        each of them in ``self.nils``.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard IPA
            safe (bool): if True, return the input untouched (no lowercasing
                         or any other processing) when ``self.regexp`` matches
                         nowhere in it

        Returns:
            unicode: IPA string with unrecognized characters included
        """
        # Function-scope import: used only for the fallback diagnostic below.
        import logging

        text = unicode(text)
        # NOTE(review): this check runs on the raw input, before lowercasing
        # and diacritic stripping, so whether an upper-case word is processed
        # depends on whether self.regexp matches it as-is -- confirm intended.
        if safe and not self.regexp.search(text):
            return text
        text = self.strip_diacritics.process(text)
        text = unicodedata.normalize('NFC', text.lower())
        if self.preproc:
            text = self.preprocessor.process(text)
        tr_list = []
        while text:
            m = self.regexp.match(text)
            if m:
                from_seg = m.group(0)
                try:
                    to_seg = self.g2p[from_seg][0]
                except (KeyError, IndexError):
                    # Missing key or empty mapping: keep the matched text.
                    # self.g2p[from_seg] is not indexed again here, since for
                    # a missing key that would raise from inside the handler.
                    logging.debug("from_seg = %s", from_seg)
                    to_seg = from_seg
                tr_list.append(to_seg)
                text = text[len(from_seg):]
            else:
                tr_list.append(text[0])
                self.nils[text[0]] += 1
                text = text[1:]
        text = ''.join(tr_list)
        if self.postproc:
            text = self.postprocessor.process(text)
        if ligatures or self.ligatures:
            text = ligaturize(text)
        if normpunc:
            text = self.puncnorm.norm(text)
        return text
    def transliterate(self, text, normpunc=False, ligatures=False):
        """Transcribe English text into IPA.

        The NFC-normalized text is split into chunks with ``self.chunk_re``.
        Chunks that ``self.letter_re`` matches at their start are converted
        with ``self.english_g2p``; all other chunks are copied verbatim.

        Args:
            text (unicode): English text
            normpunc (bool): if True, normalize punctuation downward
            ligatures (bool): if True, use non-standard ligatures instead of
                              standard IPA (also enabled by
                              ``self.ligatures``)

        Returns:
            unicode: the transcription, with punctuation normalization and
            then ligaturization applied when requested.
        """
        def to_ipa(chunk):
            # Only chunks recognised by letter_re go through the G2P step.
            if self.letter_re.match(chunk):
                return self.english_g2p(chunk)
            return chunk

        normalized = unicodedata.normalize('NFC', text)
        result = ''.join([to_ipa(c) for c in self.chunk_re.findall(normalized)])
        if normpunc:
            result = self.puncnorm.norm(result)
        if ligatures or self.ligatures:
            result = ligaturize(result)
        return result
Beispiel #5
0
    def transliterate(self, text, normpunc=False, ligatures=False):
        """Transcribe English text into IPA.

        The NFC-normalized text is split into chunks with ``self.chunk_re``.
        Chunks that ``self.letter_re`` matches at their start are converted
        with ``self.english_g2p``; all other chunks are copied verbatim.

        Args:
            text (unicode): English text
            normpunc (bool): if True, normalize punctuation downward
            ligatures (bool): if True, use non-standard ligatures instead of
                              standard IPA

        Returns:
            unicode: the transcription, with punctuation normalization and
            then ligaturization applied when requested.
        """
        composed = unicodedata.normalize('NFC', text)
        ipa = ''.join(
            self.english_g2p(piece) if self.letter_re.match(piece) else piece
            for piece in self.chunk_re.findall(composed)
        )
        if normpunc:
            ipa = self.puncnorm.norm(ipa)
        if ligatures:
            ipa = ligaturize(ipa)
        return ipa