def general_trans(self, text, filter_func, normpunc=False, ligatures=False):
    """Transliterate a word into IPA, filtering segments with filter_func.

    The input is lowercased and NFD-normalized, run through
    ``self.strip_diacritics`` and, when ``self.preproc`` is set,
    ``self.preprocessor``. It is then consumed left to right: wherever
    ``self.regexp`` matches at the current position, the matched grapheme
    is replaced by the first entry of ``self.g2p[grapheme]`` (or passed
    through unchanged when no such entry exists); otherwise one character
    is passed through and counted in ``self.nils``.

    Args:
        text (str): word to transcribe; unicode string
        filter_func (function): function for filtering segments; takes a
            <segment, is_ipa> tuple and returns a boolean. ``is_ipa`` is
            True for segments that ``self.regexp`` matched.
        normpunc (bool): normalize punctuation with ``self.puncnorm``
        ligatures (bool): use precomposed ligatures instead of standard
            IPA (also enabled by ``self.ligatures``)

    Returns:
        unicode: NFC-normalized IPA string, filtered by filter_func.
    """
    text = unicode(text)
    text = unicodedata.normalize('NFD', text.lower())
    # Lazy %-style arguments: repr() is only computed if the record is
    # actually emitted.
    logging.debug('(after norm) text=%r', list(text))
    text = self.strip_diacritics.process(text)
    logging.debug('(after strip) text=%r', list(text))
    if self.preproc:
        text = self.preprocessor.process(text)
    logging.debug('(after preproc) text=%r', list(text))

    tr_list = []
    while text:
        logging.debug('text=%r', list(text))
        m = self.regexp.match(text)
        # A zero-length match would never shorten ``text`` and would loop
        # forever, so it is treated the same as "no match".
        source = m.group(0) if m else ''
        if source:
            try:
                target = self.g2p[source][0]
            except (KeyError, IndexError):
                # Missing key or empty entry: pass the grapheme through.
                # The handler must not index self.g2p again, or it would
                # re-raise the very error it is handling.
                logging.debug(
                    "no g2p target for source = %r; passing it through",
                    source)
                target = source
            tr_list.append((target, True))
            text = text[len(source):]
        else:
            tr_list.append((text[0], False))
            # NOTE(review): unmapped characters are counted in steps of 2
            # here -- confirm this is intended.
            self.nils[text[0]] += 2
            text = text[1:]

    text = ''.join(seg for seg, _ in filter(filter_func, tr_list))
    if self.postproc:
        text = self.postprocessor.process(text)
    if ligatures or self.ligatures:
        text = ligaturize(text)
    if normpunc:
        text = self.puncnorm.norm(text)
    return unicodedata.normalize('NFC', text)
def general_trans(self, text, filter_func, normpunc=False, ligatures=False):
    """Transliterate a word into IPA, filtering segments with filter_func.

    The input is run through ``self.strip_diacritics``, lowercased and
    NFC-normalized, then run through ``self.preprocessor`` when
    ``self.preproc`` is set. It is then consumed left to right: wherever
    ``self.regexp`` matches at the current position, the matched grapheme
    is replaced by the first entry of ``self.g2p[grapheme]`` (or passed
    through unchanged when no such entry exists); otherwise one character
    is passed through and counted in ``self.nils``.

    Args:
        text (str): word to transcribe; unicode string
        filter_func (function): function for filtering segments; takes a
            <segment, is_ipa> tuple and returns a boolean. ``is_ipa`` is
            True for segments that ``self.regexp`` matched.
        normpunc (bool): normalize punctuation with ``self.puncnorm``
        ligatures (bool): use precomposed ligatures instead of standard
            IPA (also enabled by ``self.ligatures``)

    Returns:
        unicode: IPA string, filtered by filter_func.
    """
    text = unicode(text)
    text = self.strip_diacritics.process(text)
    text = unicodedata.normalize('NFC', text.lower())
    if self.preproc:
        text = self.preprocessor.process(text)

    tr_list = []
    while text:
        m = self.regexp.match(text)
        # A zero-length match would never shorten ``text`` and would loop
        # forever, so it is treated the same as "no match".
        source = m.group(0) if m else ''
        if source:
            try:
                target = self.g2p[source][0]
            except (KeyError, IndexError):
                # Missing key or empty entry: pass the grapheme through.
                # The handler must not index self.g2p again, or it would
                # re-raise the very error it is handling.
                logging.debug(
                    "no g2p target for source = %r; passing it through",
                    source)
                target = source
            tr_list.append((target, True))
            text = text[len(source):]
        else:
            tr_list.append((text[0], False))
            # NOTE(review): unmapped characters are counted in steps of 2
            # here -- confirm this is intended.
            self.nils[text[0]] += 2
            text = text[1:]

    text = ''.join(seg for seg, _ in filter(filter_func, tr_list))
    if self.postproc:
        text = self.postprocessor.process(text)
    if ligatures or self.ligatures:
        text = ligaturize(text)
    if normpunc:
        text = self.puncnorm.norm(text)
    return text
def transliterate(self, text, normpunc=False, ligatures=False, safe=True):
    """Transliterate/transcribe a word into IPA.

    Passes unmapped characters through to the output unchanged, counting
    each one in ``self.nils``.

    Args:
        text (str): word to transcribe; unicode string
        normpunc (bool): normalize punctuation with ``self.puncnorm``
        ligatures (bool): use precomposed ligatures instead of standard
            IPA (also enabled by ``self.ligatures``)
        safe (bool): if True and ``self.regexp`` finds no match anywhere
            in the input, return the input as-is without any further
            processing (no lowercasing, no pre/post-processing)

    Returns:
        unicode: IPA string with unrecognized characters included
    """
    import logging

    text = unicode(text)
    if safe and not self.regexp.search(text):
        return text
    text = self.strip_diacritics.process(text)
    text = unicodedata.normalize('NFC', text.lower())
    if self.preproc:
        text = self.preprocessor.process(text)

    tr_list = []
    while text:
        m = self.regexp.match(text)
        # A zero-length match would never shorten ``text`` and would loop
        # forever, so it is treated the same as "no match".
        from_seg = m.group(0) if m else ''
        if from_seg:
            try:
                to_seg = self.g2p[from_seg][0]
            except (KeyError, IndexError):
                # Missing key or empty entry: pass the grapheme through.
                # The handler must not index self.g2p again, or it would
                # re-raise the very error it is handling.
                logging.debug(
                    "no g2p target for from_seg = %r; passing it through",
                    from_seg)
                to_seg = from_seg
            tr_list.append(to_seg)
            text = text[len(from_seg):]
        else:
            tr_list.append(text[0])
            self.nils[text[0]] += 1
            text = text[1:]

    text = ''.join(tr_list)
    if self.postproc:
        text = self.postprocessor.process(text)
    if ligatures or self.ligatures:
        text = ligaturize(text)
    if normpunc:
        text = self.puncnorm.norm(text)
    return text
def transliterate(self, text, normpunc=False, ligatures=False):
    """Convert English text to IPA transcription.

    The NFC-normalized input is split with ``self.chunk_re.findall``.
    Chunks that ``self.letter_re`` matches at their start are converted
    with ``self.english_g2p``; all other chunks are copied unchanged.

    Args:
        text (unicode): English text
        normpunc (bool): if True, normalize punctuation downward
        ligatures (bool): if True, use non-standard ligatures instead of
            standard IPA (also enabled by ``self.ligatures``)

    Returns:
        unicode: the concatenated, optionally post-processed chunks
    """
    normalized = unicodedata.normalize('NFC', text)
    pieces = [
        self.english_g2p(piece) if self.letter_re.match(piece) else piece
        for piece in self.chunk_re.findall(normalized)
    ]
    result = ''.join(pieces)
    if normpunc:
        result = self.puncnorm.norm(result)
    if ligatures or self.ligatures:
        result = ligaturize(result)
    return result
def transliterate(self, text, normpunc=False, ligatures=False):
    """Convert English text to IPA transcription.

    The NFC-normalized input is split with ``self.chunk_re.findall``.
    Chunks that ``self.letter_re`` matches at their start are converted
    with ``self.english_g2p``; all other chunks are copied unchanged.

    Args:
        text (unicode): English text
        normpunc (bool): if True, normalize punctuation downward
        ligatures (bool): if True, use non-standard ligatures instead of
            standard IPA; an instance-level ``self.ligatures`` flag, when
            present and true, enables this as well

    Returns:
        unicode: the concatenated, optionally post-processed chunks
    """
    text = unicodedata.normalize('NFC', text)
    acc = []
    for chunk in self.chunk_re.findall(text):
        if self.letter_re.match(chunk):
            acc.append(self.english_g2p(chunk))
        else:
            acc.append(chunk)
    text = ''.join(acc)
    if normpunc:
        text = self.puncnorm.norm(text)
    # Honour the instance-wide setting as well as the per-call flag.
    # getattr keeps instances that never define ``ligatures`` working
    # exactly as before.
    if ligatures or getattr(self, 'ligatures', False):
        text = ligaturize(text)
    return text