def __init__(self, code, preproc=True, postproc=True, ligatures=False, rev=False, rev_preproc=True, rev_postproc=True):
    """Construct the backend object epitran uses for most languages.

    Args:
        code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
        preproc (bool): if True, apply preprocessor
        postproc (bool): if True, apply postprocessors
        ligatures (bool): if True, use phonetic ligatures for affricates
            instead of standard IPA
        rev (bool): if True, load reverse transliteration
        rev_preproc (bool): if True, apply preprocessor when reverse
            transliterating
        rev_postproc (bool): if True, apply postprocessor when reverse
            transliterating
    """
    self.rev = rev
    # Forward grapheme-to-phoneme table plus the regex that tokenizes
    # input text into its graphemes.
    self.g2p = self._load_g2p_map(code, False)
    self.regexp = self._construct_regex(self.g2p.keys())
    self.puncnorm = PuncNorm()
    self.ft = panphon.FeatureTable()
    self.num_panphon_fts = len(self.ft.names)
    self.preprocessor = PrePostProcessor(code, 'pre', False)
    self.postprocessor = PrePostProcessor(code, 'post', False)
    self.strip_diacritics = StripDiacritics(code)
    # Pipeline toggles.
    self.preproc = preproc
    self.postproc = postproc
    self.ligatures = ligatures
    self.rev_preproc = rev_preproc
    self.rev_postproc = rev_postproc
    if rev:
        # The reverse (IPA -> orthography) table and its processors are
        # loaded only when requested.
        self.rev_g2p = self._load_g2p_map(code, True)
        self.rev_regexp = self._construct_regex(self.rev_g2p.keys())
        self.rev_preprocessor = PrePostProcessor(code, 'pre', True)
        self.rev_postprocessor = PrePostProcessor(code, 'post', True)
    # Tally of characters that could not be mapped.
    self.nils = defaultdict(int)
def __init__(self, code, preproc=True, postproc=True, ligatures=False, cedict_file=None, rev=False, rev_preproc=True, rev_postproc=True):
    """Construct an Epitran transliteration/transcription object.

    Args:
        code (str): ISO 639-3 plus "-" plus ISO 15924 code of the
            language/script pair that should be loaded
        preproc (bool): apply preprocessors
        postproc (bool): apply postprocessors
        ligatures (bool): use precomposed ligatures instead of standard IPA
        cedict_file (str): path to file containing the CC-CEDict
            dictionary; relevant only for Chinese
        rev (bool): if True, load reverse transliteration
        rev_preproc (bool): if True, apply preprocessor when reverse
            transliterating
        rev_postproc (bool): if True, apply postprocessor when reverse
            transliterating
    """
    if code in self.special:
        # Languages with dedicated backends (registered in self.special)
        # bypass the generic mapping-table implementation.
        self.epi = self.special[code](ligatures=ligatures, cedict_file=cedict_file)
    else:
        self.epi = SimpleEpitran(code, preproc, postproc, ligatures, rev, rev_preproc, rev_postproc)
    self.puncnorm = PuncNorm()
    self.xsampa = XSampa()
    self.ft = panphon.featuretable.FeatureTable()
def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
    """Construct a Flite "wrapper".

    Args:
        arpabet (str): file containing ARPAbet to IPA mapping
        ligatures (bool): if True, use non-standard ligatures instead of
            standard IPA
        cedict_file (str): path to CC-CEDict dictionary (accepted only for
            interface compatibility with other backends; unused here)
    """
    path = pkg_resources.resource_filename(
        __name__, os.path.join('data', arpabet + '.csv'))
    self.arpa_map = self._read_arpabet(path)
    # Alternating runs of word characters and non-word characters.
    self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
    self.letter_re = re.compile(r"[A-Za-z'’]+")
    self.ligatures = ligatures
    self.puncnorm = PuncNorm()
    self.ft = panphon.FeatureTable()
    # NOTE(review): other Flite variants also set self.num_panphon_fts and
    # self.regexp here — confirm nothing in this variant reads them.
def __init__(self, arpabet='arpabet', ligatures=False, **kwargs):
    """Construct a Flite "wrapper".

    Args:
        arpabet (str): file containing ARPAbet to IPA mapping
        ligatures (bool): if True, use non-standard ligatures instead of
            standard IPA
        **kwargs: additional keyword arguments accepted (and ignored) for
            interface compatibility with other backends
    """
    path = pkg_resources.resource_filename(
        __name__, os.path.join('data', arpabet + '.csv'))
    self.arpa_map = self._read_arpabet(path)
    # Alternating runs of word characters and non-word characters.
    self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
    self.letter_re = re.compile(r"[A-Za-z'’]+")
    self.regexp = re.compile(r'[A-Za-z]')
    self.ligatures = ligatures
    self.puncnorm = PuncNorm()
    self.ft = panphon.FeatureTable()
    self.num_panphon_fts = len(self.ft.names)
def __init__(self, lang_script_codes, cedict_file=None):
    """Construct a Backoff object.

    Args:
        lang_script_codes (list): codes for languages to try, starting with
            the highest priority languages
        cedict_file (str): path to the CC-CEdict dictionary file (necessary
            only when cmn-Hans or cmn-Hant are used)
    """
    # One Epitran instance per candidate language, in priority order.
    self.langs = [_epitran.Epitran(code, cedict_file=cedict_file)
                  for code in lang_script_codes]
    # NOTE(review): \p{Number} is not supported by the stdlib re module —
    # confirm `re` here is bound to the third-party `regex` package.
    self.num_re = re.compile(r'\p{Number}+')
    self.puncnorm = PuncNorm()
    self.xsampa = XSampa()
    self.ft = panphon.featuretable.FeatureTable()
    # Matching per-language diacritic strippers, same priority order.
    self.dias = [StripDiacritics(code) for code in lang_script_codes]
def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
    """Construct a Flite "wrapper".

    Args:
        arpabet (str): file containing ARPAbet to IPA mapping
        ligatures (bool): if True, use non-standard ligatures instead of
            standard IPA
        cedict_file (str): path to CC-CEDict dictionary (accepted only for
            interface compatibility with other backends; unused here)
    """
    path = pkg_resources.resource_filename(
        __name__, os.path.join('data', arpabet + '.csv'))
    self.arpa_map = self._read_arpabet(path)
    # Alternating runs of word characters and non-word characters.
    self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
    self.letter_re = re.compile(r"[A-Za-z'’]+")
    self.regexp = re.compile(r'[A-Za-z]')
    self.ligatures = ligatures
    self.puncnorm = PuncNorm()
    self.ft = panphon.FeatureTable()
    # NOTE(review): a sibling variant also sets self.num_panphon_fts here —
    # confirm nothing in this variant's word_to_tuples reads it.
class Flite(object):
    """English G2P using the Flite speech synthesis system."""

    def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
        """Construct a Flite "wrapper".

        Args:
            arpabet (str): file containing ARPAbet to IPA mapping
            ligatures (bool): if True, use non-standard ligatures instead of
                standard IPA
            cedict_file (str): path to CC-CEDict dictionary (accepted only
                for interface compatibility; unused here)
        """
        arpabet = pkg_resources.resource_filename(
            __name__, os.path.join('data', arpabet + '.csv'))
        self.arpa_map = self._read_arpabet(arpabet)
        # Alternating runs of word characters and non-word characters.
        self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
        self.letter_re = re.compile(r"[A-Za-z'’]+")
        self.regexp = re.compile(r'[A-Za-z]')
        self.puncnorm = PuncNorm()
        self.ligatures = ligatures
        self.ft = panphon.FeatureTable()
        # BUG FIX: word_to_tuples (via to_vectors) reads
        # self.num_panphon_fts for empty phonetic forms, but it was never
        # set, raising AttributeError at runtime; initialize it here.
        self.num_panphon_fts = len(self.ft.names)

    def _read_arpabet(self, arpabet):
        """Read the ARPAbet-to-IPA mapping from a two-column CSV file."""
        arpa_map = {}
        with open(arpabet, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            for arpa, ipa in reader:
                arpa_map[arpa] = ipa
        return arpa_map

    def normalize(self, text):
        """Decompose to NFD and drop characters outside string.printable."""
        text = unicode(text)
        text = unicodedata.normalize('NFD', text)
        text = ''.join(filter(lambda x: x in string.printable, text))
        return text

    def arpa_text_to_list(self, arpa_text):
        # The ARPAbet string is wrapped in delimiter tokens; drop the first
        # and last tokens and keep the phones in between.
        return arpa_text.split(' ')[1:-1]

    def arpa_to_ipa(self, arpa_text, ligatures=False):
        """Convert a string of ARPAbet phones to an IPA string."""
        arpa_text = arpa_text.strip()
        arpa_list = self.arpa_text_to_list(arpa_text)
        # Strip stress digits (e.g. 'AH0' -> 'AH') before table lookup.
        # BUG FIX: use a raw string; '\d' is an invalid escape sequence in a
        # plain string literal.
        arpa_list = map(lambda d: re.sub(r'\d', '', d), arpa_list)
        ipa_list = map(lambda d: self.arpa_map[d], arpa_list)
        text = ''.join(ipa_list)
        return text

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Convert English text to IPA transcription.

        Args:
            text (unicode): English text
            normpunc (bool): if True, normalize punctuation downward
            ligatures (bool): if True, use non-standard ligatures instead of
                standard IPA
        """
        text = unicodedata.normalize('NFC', text)
        acc = []
        for chunk in self.chunk_re.findall(text):
            # Only letter runs are fed to the G2P engine; everything else
            # (punctuation, digits, whitespace) passes through unchanged.
            if self.letter_re.match(chunk):
                acc.append(self.english_g2p(chunk))
            else:
                acc.append(chunk)
        text = ''.join(acc)
        text = self.puncnorm.norm(text) if normpunc else text
        text = ligaturize(text) if (ligatures or self.ligatures) else text
        return text

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Alias for transliterate, provided for API compatibility."""
        return self.transliterate(text, normpunc, ligatures)

    def word_to_tuples(self, word, normpunc=False):
        """Given a word, returns a list of tuples corresponding to IPA segments.

        Args:
            word (unicode): word to transliterate
            normpunc (bool): If True, normalizes punctuation to ASCII
                inventory

        Returns:
            list: A list of (category, lettercase, orthographic_form,
                  phonetic_form, feature_vectors) tuples. The "feature
                  vectors" form a list consisting of (segment, vector)
                  pairs. For IPA segments, segment is a substring of
                  phonetic_form such that the concatenation of all segments
                  in the list is equal to the phonetic_form. The vectors are
                  a sequence of integers drawn from the set {-1, 0, 1} where
                  -1 corresponds to '-', 0 corresponds to '0', and 1
                  corresponds to '+'.
        """
        def cat_and_cap(c):
            # Split the two-letter Unicode category (e.g. 'Lu') into the
            # major class and a 1/0 uppercase flag.
            cat, case = tuple(unicodedata.category(c))
            case = 1 if case == 'u' else 0
            return unicode(cat), case

        def recode_ft(ft):
            try:
                return {'+': 1, '0': 0, '-': -1}[ft]
            except KeyError:
                return None

        def vec2bin(vec):
            # list(...) for consistency with the SimpleEpitran backend
            # (bare map would be a lazy iterator on Python 3).
            return list(map(recode_ft, vec))

        def to_vector(seg):
            return seg, vec2bin(self.ft.segment_to_vector(seg))

        def to_vectors(phon):
            if phon == '':
                # Placeholder vector for characters with no phonetic form.
                return [(-1, [0] * self.num_panphon_fts)]
            else:
                return [to_vector(seg) for seg in self.ft.segs(phon)]

        tuples = []
        word = unicode(word)
        # word = self.strip_diacritics.process(word)
        word = unicodedata.normalize('NFKD', word)
        word = unicodedata.normalize('NFC', word)
        while word:
            match = re.match('[A-Za-z]+', word)
            if match:
                span = match.group(0)
                cat, case = cat_and_cap(span[0])
                phonword = self.transliterate(span)
                phonsegs = self.ft.segs(phonword)
                # Pad the shorter of the orthographic/phonetic sequences so
                # they can be zipped pairwise.
                maxlen = max(len(phonsegs), len(span))
                orth = list(span) + [''] * (maxlen - len(span))
                phonsegs += [''] * (maxlen - len(phonsegs))
                for p, o in zip(phonsegs, orth):
                    tuples.append(('L', case, o, p, to_vectors(p)))
                word = word[len(span):]
            else:
                # Non-letter character: emit it with an empty phonetic form.
                span = word[0]
                span = self.puncnorm.norm(span) if normpunc else span
                cat, case = cat_and_cap(span)
                cat = 'P' if normpunc and cat in self.puncnorm else cat
                phon = ''
                vecs = to_vectors(phon)
                tuples.append((cat, case, span, phon, vecs))
                word = word[1:]
        return tuples
class SimpleEpitran(object):
    """Mapping-table-based transliterator used for most languages."""

    def __init__(self, code, preproc=True, postproc=True, ligatures=False):
        """Constructs the backend object epitran uses for most languages.

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a
                hyphen
            preproc (bool): if True, apply preprocessor
            postproc (bool): if True, apply postprocessors
            ligatures (bool): if True, use phonetic ligatures for affricates
                instead of standard IPA
        """
        self.g2p = self._load_g2p_map(code)
        self.regexp = self._construct_regex()
        self.puncnorm = PuncNorm()
        self.ft = panphon.FeatureTable()
        self.num_panphon_fts = len(self.ft.names)
        self.preprocessor = PrePostProcessor(code, 'pre')
        self.postprocessor = PrePostProcessor(code, 'post')
        self.strip_diacritics = StripDiacritics(code)
        self.preproc = preproc
        self.postproc = postproc
        self.ligatures = ligatures
        # Tally of unmappable characters, reported on __exit__.
        self.nils = defaultdict(int)

    def __enter__(self):
        return self

    def __exit__(self, type_, val, tb):
        # Report every character that could not be mapped, with its count.
        for nil, count in self.nils.items():
            sys.stderr.write('Unknown character "{}" occured {} times.\n'.format(nil, count))

    def _one_to_many_gr_by_line_map(self, gr_by_line):
        """Return (grapheme, lines) for any grapheme that appears on more
        than one line of the mapping file, or None if the map is clean."""
        for g, ls in gr_by_line.items():
            if len(ls) != 1:
                return (g, ls)
        return None

    def _load_g2p_map(self, code):
        """Load the code table for the specified language.

        Args:
            code (str): ISO 639-3 code plus "-" plus ISO 15924 code for the
                language/script to be loaded

        Raises:
            DatafileError: the mapping file is missing or malformed
            MappingError: a grapheme is mapped on more than one line
        """
        g2p = defaultdict(list)
        gr_by_line = defaultdict(list)
        try:
            path = os.path.join('data', 'map', code + '.csv')
            path = pkg_resources.resource_filename(__name__, path)
        except IndexError:
            raise DatafileError('Add an appropriately-named mapping to the data/maps directory.')
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            next(reader)  # Skip the header row.
            for (i, fields) in enumerate(reader):
                try:
                    graph, phon = fields
                except ValueError:
                    raise DatafileError('Map file is not well formed at line {}.'.format(i + 2))
                graph = unicodedata.normalize('NFC', graph)
                phon = unicodedata.normalize('NFC', phon)
                g2p[graph].append(phon)
                gr_by_line[graph].append(i)
        if self._one_to_many_gr_by_line_map(g2p):
            graph, lines = self._one_to_many_gr_by_line_map(gr_by_line)
            lines = [l + 2 for l in lines]  # 1-based, plus header.
            raise MappingError('One-to-many G2P mapping for "{}" on lines {}'.format(graph, ', '.join(map(str, lines))).encode('utf-8'))
        return g2p

    def _load_punc_norm_map(self):
        """Load the map table for normalizing 'down' punctuation."""
        path = os.path.join('data', 'puncnorm.csv')
        path = pkg_resources.resource_filename(__name__, path)
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8', delimiter=str(','), quotechar=str('"'))
            next(reader)
            return {punc: norm for (punc, norm) in reader}

    def _construct_regex(self):
        """Build a regular expression that will greedily match segments from
        the mapping table (longest graphemes first)."""
        graphemes = sorted(self.g2p.keys(), key=len, reverse=True)
        return re.compile(r'({})'.format(r'|'.join(graphemes)), re.I)

    def general_trans(self, text, filter_func, normpunc=False, ligatures=False):
        """Transliterates a word into IPA, filtering with filter_func.

        Args:
            text (str): word to transcribe; unicode strings
            filter_func (function): function for filtering segments; takes
                a <segment, is_ipa> tuple and returns a boolean.
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard
                IPA

        Returns:
            unicode: IPA string, filtered by filter_func.
        """
        text = unicode(text)
        text = self.strip_diacritics.process(text)
        text = unicodedata.normalize('NFC', text.lower())
        if self.preproc:
            text = self.preprocessor.process(text)
        tr_list = []
        while text:
            m = self.regexp.match(text)
            if m:
                source = m.group(0)
                try:
                    target = self.g2p[source][0]
                except (KeyError, IndexError):
                    # BUG FIX: self.g2p is a defaultdict(list), so a missing
                    # grapheme yields an empty list and raises IndexError
                    # (never KeyError); catch both so the pass-through
                    # fallback actually works.
                    logging.debug("source = '{}'".format(source))
                    logging.debug("self.g2p[source] = '{}'"
                                  .format(self.g2p[source]))
                    target = source
                tr_list.append((target, True))
                text = text[len(source):]
            else:
                tr_list.append((text[0], False))
                # BUG FIX: each unknown character occurs once per encounter;
                # incrementing by 2 doubled the counts reported on __exit__.
                self.nils[text[0]] += 1
                text = text[1:]
        text = ''.join([s for (s, _) in filter(filter_func, tr_list)])
        if self.postproc:
            text = self.postprocessor.process(text)
        if ligatures or self.ligatures:
            text = ligaturize(text)
        if normpunc:
            text = self.puncnorm.norm(text)
        return text

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a word into IPA.

        Passes unmapped characters through to output unchanged.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard
                IPA

        Returns:
            unicode: IPA string with unrecognized characters included
        """
        return self.general_trans(text, lambda x: True, normpunc, ligatures)

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a word into IPA.

        Ignores unmapped characters.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard
                IPA

        Returns:
            unicode: IPA string
        """
        return self.general_trans(text, lambda x: x[1], normpunc, ligatures)

    def word_to_tuples(self, word, normpunc=False):
        """Given a word, returns a list of tuples corresponding to IPA segments.

        Args:
            word (unicode): word to transliterate
            normpunc (bool): If True, normalizes punctuation to ASCII
                inventory

        Returns:
            list: A list of (category, lettercase, orthographic_form,
                  phonetic_form, feature_vectors) tuples. The "feature
                  vectors" form a list consisting of (segment, vector)
                  pairs. For IPA segments, segment is a substring of
                  phonetic_form such that the concatenation of all segments
                  in the list is equal to the phonetic_form. The vectors are
                  a sequence of integers drawn from the set {-1, 0, 1} where
                  -1 corresponds to '-', 0 corresponds to '0', and 1
                  corresponds to '+'.
        """
        def cat_and_cap(c):
            # Split the two-letter Unicode category (e.g. 'Lu') into the
            # major class and a 1/0 uppercase flag.
            cat, case = tuple(unicodedata.category(c))
            case = 1 if case == 'u' else 0
            return unicode(cat), case

        def recode_ft(ft):
            try:
                return {'+': 1, '0': 0, '-': -1}[ft]
            except KeyError:
                return None

        def vec2bin(vec):
            return list(map(recode_ft, vec))

        def to_vector(seg):
            return seg, vec2bin(self.ft.segment_to_vector(seg))

        def to_vectors(phon):
            if phon == '':
                # Placeholder vector for characters with no phonetic form.
                return [(-1, [0] * self.num_panphon_fts)]
            else:
                return [to_vector(seg) for seg in self.ft.ipa_segs(phon)]

        tuples = []
        word = unicode(word)
        word = self.strip_diacritics.process(word)
        word = unicodedata.normalize('NFC', word)
        if self.preproc:
            word = self.preprocessor.process(word)
        while word:
            match = self.regexp.match(word)
            if match:
                span = match.group(1)
                cat, case = cat_and_cap(span[0])
                phon = self.g2p[span.lower()][0]
                vecs = to_vectors(phon)
                tuples.append(('L', case, span, phon, vecs))
                word = word[len(span):]
            else:
                # Unmapped character: emit it with an empty phonetic form.
                span = word[0]
                span = self.puncnorm.norm(span) if normpunc else span
                cat, case = cat_and_cap(span)
                cat = 'P' if normpunc and cat in self.puncnorm else cat
                phon = ''
                vecs = to_vectors(phon)
                tuples.append((cat, case, span, phon, vecs))
                word = word[1:]
        return tuples

    def ipa_segs(self, ipa):
        """Given an IPA string, decompose it into a list of segments.

        Args:
            ipa (unicode): a Unicode IPA string

        Returns:
            list: a list of unicode strings corresponding to segments
                (consonants and vowels) in the input string
        """
        return self.ft.segs(ipa)
class Flite(object):
    """English G2P using the Flite speech synthesis system."""

    def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
        """Construct a Flite "wrapper".

        Args:
            arpabet (str): file containing ARPAbet to IPA mapping
            ligatures (bool): if True, use non-standard ligatures instead of
                standard IPA
            cedict_file (str): path to CC-CEDict dictionary (accepted only
                for interface compatibility; unused here)
        """
        arpabet = pkg_resources.resource_filename(
            __name__, os.path.join('data', arpabet + '.csv'))
        self.arpa_map = self._read_arpabet(arpabet)
        # Alternating runs of word characters and non-word characters.
        self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
        self.letter_re = re.compile(r"[A-Za-z'’]+")
        self.regexp = re.compile(r'[A-Za-z]')
        self.puncnorm = PuncNorm()
        self.ligatures = ligatures
        self.ft = panphon.FeatureTable()
        # BUG FIX: word_to_tuples (via to_vectors) reads
        # self.num_panphon_fts for empty phonetic forms, but it was never
        # set, raising AttributeError at runtime; initialize it here.
        self.num_panphon_fts = len(self.ft.names)

    def _read_arpabet(self, arpabet):
        """Read the ARPAbet-to-IPA mapping from a two-column CSV file."""
        arpa_map = {}
        with open(arpabet, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            for arpa, ipa in reader:
                arpa_map[arpa] = ipa
        return arpa_map

    def normalize(self, text):
        """Decompose to NFD and drop characters outside string.printable."""
        text = unicode(text)
        text = unicodedata.normalize('NFD', text)
        text = ''.join(filter(lambda x: x in string.printable, text))
        return text

    def arpa_text_to_list(self, arpa_text):
        # The ARPAbet string is wrapped in delimiter tokens; drop the first
        # and last tokens and keep the phones in between.
        return arpa_text.split(' ')[1:-1]

    def arpa_to_ipa(self, arpa_text, ligatures=False):
        """Convert a string of ARPAbet phones to an IPA string."""
        arpa_text = arpa_text.strip()
        arpa_list = self.arpa_text_to_list(arpa_text)
        # Strip stress digits (e.g. 'AH0' -> 'AH') before table lookup.
        # BUG FIX: use a raw string; '\d' is an invalid escape sequence in a
        # plain string literal.
        arpa_list = map(lambda d: re.sub(r'\d', '', d), arpa_list)
        ipa_list = map(lambda d: self.arpa_map[d], arpa_list)
        text = ''.join(ipa_list)
        return text

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Convert English text to IPA transcription.

        Args:
            text (unicode): English text
            normpunc (bool): if True, normalize punctuation downward
            ligatures (bool): if True, use non-standard ligatures instead of
                standard IPA
        """
        text = unicodedata.normalize('NFC', text)
        acc = []
        for chunk in self.chunk_re.findall(text):
            # Only letter runs are fed to the G2P engine; everything else
            # (punctuation, digits, whitespace) passes through unchanged.
            if self.letter_re.match(chunk):
                acc.append(self.english_g2p(chunk))
            else:
                acc.append(chunk)
        text = ''.join(acc)
        text = self.puncnorm.norm(text) if normpunc else text
        # BUG FIX: honor the instance-level ligatures flag stored by the
        # constructor (it was previously ignored here), consistent with the
        # other backends.
        text = ligaturize(text) if (ligatures or self.ligatures) else text
        return text

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Alias for transliterate, provided for API compatibility."""
        return self.transliterate(text, normpunc, ligatures)

    def word_to_tuples(self, word, normpunc=False):
        """Given a word, returns a list of tuples corresponding to IPA segments.

        Args:
            word (unicode): word to transliterate
            normpunc (bool): If True, normalizes punctuation to ASCII
                inventory

        Returns:
            list: A list of (category, lettercase, orthographic_form,
                  phonetic_form, feature_vectors) tuples. The "feature
                  vectors" form a list consisting of (segment, vector)
                  pairs. For IPA segments, segment is a substring of
                  phonetic_form such that the concatenation of all segments
                  in the list is equal to the phonetic_form. The vectors are
                  a sequence of integers drawn from the set {-1, 0, 1} where
                  -1 corresponds to '-', 0 corresponds to '0', and 1
                  corresponds to '+'.
        """
        def cat_and_cap(c):
            # Split the two-letter Unicode category (e.g. 'Lu') into the
            # major class and a 1/0 uppercase flag.
            cat, case = tuple(unicodedata.category(c))
            case = 1 if case == 'u' else 0
            return unicode(cat), case

        def recode_ft(ft):
            try:
                return {'+': 1, '0': 0, '-': -1}[ft]
            except KeyError:
                return None

        def vec2bin(vec):
            # list(...) for consistency with the SimpleEpitran backend
            # (bare map would be a lazy iterator on Python 3).
            return list(map(recode_ft, vec))

        def to_vector(seg):
            return seg, vec2bin(self.ft.segment_to_vector(seg))

        def to_vectors(phon):
            if phon == '':
                # Placeholder vector for characters with no phonetic form.
                return [(-1, [0] * self.num_panphon_fts)]
            else:
                return [to_vector(seg) for seg in self.ft.segs(phon)]

        tuples = []
        word = unicode(word)
        # word = self.strip_diacritics.process(word)
        word = unicodedata.normalize('NFKD', word)
        word = unicodedata.normalize('NFC', word)
        while word:
            match = re.match('[A-Za-z]+', word)
            if match:
                span = match.group(0)
                cat, case = cat_and_cap(span[0])
                phonword = self.transliterate(span)
                phonsegs = self.ft.segs(phonword)
                # Pad the shorter of the orthographic/phonetic sequences so
                # they can be zipped pairwise.
                maxlen = max(len(phonsegs), len(span))
                orth = list(span) + [''] * (maxlen - len(span))
                phonsegs += [''] * (maxlen - len(phonsegs))
                for p, o in zip(phonsegs, orth):
                    tuples.append(('L', case, o, p, to_vectors(p)))
                word = word[len(span):]
            else:
                # Non-letter character: emit it with an empty phonetic form.
                span = word[0]
                span = self.puncnorm.norm(span) if normpunc else span
                cat, case = cat_and_cap(span)
                cat = 'P' if normpunc and cat in self.puncnorm else cat
                phon = ''
                vecs = to_vectors(phon)
                tuples.append((cat, case, span, phon, vecs))
                word = word[1:]
        return tuples
class SimpleEpitran(object):
    """Mapping-table-based transliterator used for most languages, with
    optional reverse (IPA-to-orthography) transliteration."""

    def __init__(self, code, preproc=True, postproc=True, ligatures=False, rev=False, rev_preproc=True, rev_postproc=True):
        """Constructs the backend object epitran uses for most languages.

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a
                hyphen
            preproc (bool): if True, apply preprocessor
            postproc (bool): if True, apply postprocessors
            ligatures (bool): if True, use phonetic ligatures for affricates
                instead of standard IPA
            rev (bool): if True, load reverse transliteration
            rev_preproc (bool): if True, apply preprocessor when reverse
                transliterating
            rev_postproc (bool): if True, apply postprocessor when reverse
                transliterating
        """
        self.rev = rev
        self.g2p = self._load_g2p_map(code, False)
        self.regexp = self._construct_regex(self.g2p.keys())
        self.puncnorm = PuncNorm()
        self.ft = panphon.FeatureTable()
        self.num_panphon_fts = len(self.ft.names)
        self.preprocessor = PrePostProcessor(code, 'pre', False)
        self.postprocessor = PrePostProcessor(code, 'post', False)
        self.strip_diacritics = StripDiacritics(code)
        self.preproc = preproc
        self.postproc = postproc
        self.ligatures = ligatures
        self.rev_preproc = rev_preproc
        self.rev_postproc = rev_postproc
        if rev:
            # The reverse table and its processors are loaded on demand.
            self.rev_g2p = self._load_g2p_map(code, True)
            self.rev_regexp = self._construct_regex(self.rev_g2p.keys())
            self.rev_preprocessor = PrePostProcessor(code, 'pre', True)
            self.rev_postprocessor = PrePostProcessor(code, 'post', True)
        # Tally of unmappable characters, reported on __exit__.
        self.nils = defaultdict(int)

    def __enter__(self):
        return self

    def __exit__(self, type_, val, tb):
        # Report every character that could not be mapped, with its count.
        for nil, count in self.nils.items():
            sys.stderr.write('Unknown character "{}" occured {} times.\n'.format(nil, count))

    def _one_to_many_gr_by_line_map(self, gr_by_line):
        """Return (grapheme, lines) for any grapheme that appears on more
        than one line of the mapping file, or None if the map is clean."""
        for g, ls in gr_by_line.items():
            if len(ls) != 1:
                return (g, ls)
        return None

    def _load_g2p_map(self, code, rev):
        """Load the code table for the specified language.

        Args:
            code (str): ISO 639-3 code plus "-" plus ISO 15924 code for the
                language/script to be loaded
            rev (bool): True for reversing the table (for reverse
                transliterating)

        Raises:
            DatafileError: the mapping file is missing or malformed
            MappingError: a grapheme is mapped on more than one line
        """
        g2p = defaultdict(list)
        gr_by_line = defaultdict(list)
        # Reverse tables live in files suffixed with '_rev'.
        code += '_rev' if rev else ''
        try:
            path = os.path.join('data', 'map', code + '.csv')
            path = pkg_resources.resource_filename(__name__, path)
        except IndexError:
            raise DatafileError('Add an appropriately-named mapping to the data/maps directory.')
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8')
            next(reader)  # Skip the header row.
            for (i, fields) in enumerate(reader):
                try:
                    graph, phon = fields
                except ValueError:
                    raise DatafileError('Map file is not well formed at line {}.'.format(i + 2))
                graph = unicodedata.normalize('NFC', graph)
                phon = unicodedata.normalize('NFC', phon)
                g2p[graph].append(phon)
                gr_by_line[graph].append(i)
        if self._one_to_many_gr_by_line_map(g2p):
            graph, lines = self._one_to_many_gr_by_line_map(gr_by_line)
            lines = [l + 2 for l in lines]  # 1-based, plus header.
            raise MappingError('One-to-many G2P mapping for "{}" on lines {}'.format(graph, ', '.join(map(str, lines))).encode('utf-8'))
        return g2p

    def _load_punc_norm_map(self):
        """Load the map table for normalizing 'down' punctuation."""
        path = os.path.join('data', 'puncnorm.csv')
        path = pkg_resources.resource_filename(__name__, path)
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8', delimiter=str(','), quotechar=str('"'))
            next(reader)
            return {punc: norm for (punc, norm) in reader}

    def _construct_regex(self, g2p_keys):
        """Build a regular expression that will greedily match segments from
        the mapping table (longest graphemes first)."""
        graphemes = sorted(g2p_keys, key=len, reverse=True)
        return re.compile(r'({})'.format(r'|'.join(graphemes)), re.I)

    def general_trans(self, text, filter_func, normpunc=False, ligatures=False):
        """Transliterates a word into IPA, filtering with filter_func.

        Args:
            text (str): word to transcribe; unicode strings
            filter_func (function): function for filtering segments; takes
                a <segment, is_ipa> tuple and returns a boolean.
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard
                IPA

        Returns:
            unicode: IPA string, filtered by filter_func.
        """
        text = unicode(text)
        text = self.strip_diacritics.process(text)
        text = unicodedata.normalize('NFC', text.lower())
        if self.preproc:
            text = self.preprocessor.process(text)
        tr_list = []
        while text:
            m = self.regexp.match(text)
            if m:
                source = m.group(0)
                try:
                    target = self.g2p[source][0]
                except (KeyError, IndexError):
                    # BUG FIX: self.g2p is a defaultdict(list), so a missing
                    # grapheme yields an empty list and raises IndexError
                    # (never KeyError); catch both so the pass-through
                    # fallback actually works.
                    logging.debug("source = '{}'".format(source))
                    logging.debug("self.g2p[source] = '{}'"
                                  .format(self.g2p[source]))
                    target = source
                tr_list.append((target, True))
                text = text[len(source):]
            else:
                tr_list.append((text[0], False))
                # BUG FIX: each unknown character occurs once per encounter;
                # incrementing by 2 doubled the counts reported on __exit__.
                self.nils[text[0]] += 1
                text = text[1:]
        text = ''.join([s for (s, _) in filter(filter_func, tr_list)])
        if self.postproc:
            text = self.postprocessor.process(text)
        if ligatures or self.ligatures:
            text = ligaturize(text)
        if normpunc:
            text = self.puncnorm.norm(text)
        return text

    def transliterate(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a word into IPA.

        Passes unmapped characters through to output unchanged.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard
                IPA

        Returns:
            unicode: IPA string with unrecognized characters included
        """
        return self.general_trans(text, lambda x: True, normpunc, ligatures)

    def general_reverse_trans(self, ipa):
        """Reconstructs word from IPA. Does the reverse of transliterate().
        Ignores unmapped characters.

        Args:
            ipa (str): word transcription in ipa; unicode string

        Returns:
            unicode: reconstructed word
        """
        text = unicode(ipa)
        if self.rev_preproc:
            text = self.rev_preprocessor.process(text)
        tr_list = []
        while text:
            m = self.rev_regexp.match(text)
            if m:
                source = m.group(0)
                try:
                    target = self.rev_g2p[source][0]
                except (KeyError, IndexError):
                    # BUG FIX (two fixes): (1) rev_g2p is a defaultdict, so a
                    # missing key raises IndexError, not KeyError; (2) the
                    # debug message previously read self.g2p[source] — the
                    # forward table — instead of the reverse table.
                    logging.debug("source = '{}'".format(source))
                    logging.debug("self.rev_g2p[source] = '{}'"
                                  .format(self.rev_g2p[source]))
                    target = source
                tr_list.append((target, True))
                text = text[len(source):]
            else:
                tr_list.append((text[0], False))
                # BUG FIX: count each unknown character once, not twice.
                self.nils[text[0]] += 1
                text = text[1:]
        text = ''.join([s for (s, _) in tr_list])
        if self.rev_postproc:
            text = self.rev_postprocessor.process(text)
        return text

    def reverse_transliterate(self, ipa):
        """Reconstructs word from IPA. Does the reverse of transliterate().

        Args:
            ipa (str): word transcription in ipa; unicode string

        Returns:
            unicode: reconstructed word

        Raises:
            ValueError: the object was constructed with rev=False
        """
        if not self.rev:
            raise ValueError('This Epitran object was initialized with no reverse transliteration loaded')
        return self.general_reverse_trans(ipa)

    def strict_trans(self, text, normpunc=False, ligatures=False):
        """Transliterates/transcribes a word into IPA.

        Ignores unmapped characters.

        Args:
            text (str): word to transcribe; unicode string
            normpunc (bool): normalize punctuation
            ligatures (bool): use precomposed ligatures instead of standard
                IPA

        Returns:
            unicode: IPA string
        """
        return self.general_trans(text, lambda x: x[1], normpunc, ligatures)

    def word_to_tuples(self, word, normpunc=False):
        """Given a word, returns a list of tuples corresponding to IPA segments.

        Args:
            word (unicode): word to transliterate
            normpunc (bool): If True, normalizes punctuation to ASCII
                inventory

        Returns:
            list: A list of (category, lettercase, orthographic_form,
                  phonetic_form, feature_vectors) tuples. The "feature
                  vectors" form a list consisting of (segment, vector)
                  pairs. For IPA segments, segment is a substring of
                  phonetic_form such that the concatenation of all segments
                  in the list is equal to the phonetic_form. The vectors are
                  a sequence of integers drawn from the set {-1, 0, 1} where
                  -1 corresponds to '-', 0 corresponds to '0', and 1
                  corresponds to '+'.
        """
        def cat_and_cap(c):
            # Split the two-letter Unicode category (e.g. 'Lu') into the
            # major class and a 1/0 uppercase flag.
            cat, case = tuple(unicodedata.category(c))
            case = 1 if case == 'u' else 0
            return unicode(cat), case

        def recode_ft(ft):
            try:
                return {'+': 1, '0': 0, '-': -1}[ft]
            except KeyError:
                return None

        def vec2bin(vec):
            return list(map(recode_ft, vec))

        def to_vector(seg):
            return seg, vec2bin(self.ft.segment_to_vector(seg))

        def to_vectors(phon):
            if phon == '':
                # Placeholder vector for characters with no phonetic form.
                return [(-1, [0] * self.num_panphon_fts)]
            else:
                return [to_vector(seg) for seg in self.ft.ipa_segs(phon)]

        tuples = []
        word = unicode(word)
        word = self.strip_diacritics.process(word)
        word = unicodedata.normalize('NFC', word)
        if self.preproc:
            word = self.preprocessor.process(word)
        while word:
            match = self.regexp.match(word)
            if match:
                span = match.group(1)
                cat, case = cat_and_cap(span[0])
                phon = self.g2p[span.lower()][0]
                vecs = to_vectors(phon)
                tuples.append(('L', case, span, phon, vecs))
                word = word[len(span):]
            else:
                # Unmapped character: emit it with an empty phonetic form.
                span = word[0]
                span = self.puncnorm.norm(span) if normpunc else span
                cat, case = cat_and_cap(span)
                cat = 'P' if normpunc and cat in self.puncnorm else cat
                phon = ''
                vecs = to_vectors(phon)
                tuples.append((cat, case, span, phon, vecs))
                word = word[1:]
        return tuples

    def ipa_segs(self, ipa):
        """Given an IPA string, decompose it into a list of segments.

        Args:
            ipa (unicode): a Unicode IPA string

        Returns:
            list: a list of unicode strings corresponding to segments
                (consonants and vowels) in the input string
        """
        return self.ft.segs(ipa)