def _load_g2p_map(self, code):
    """Load the grapheme-to-phoneme table for the specified language.

    The table is read from the packaged resource ``data/map/<code>.csv``.
    The first row is discarded (it is treated as a header); every later
    row must unpack into exactly two fields, a grapheme string and a
    phoneme string. Both fields are NFC-normalized before being stored.

    Args:
        code (str): ISO 639-3 code plus "-" plus ISO 15924 code for the
            language/script to be loaded

    Returns:
        defaultdict: maps each NFC-normalized grapheme string to the list
            of phoneme strings found for it in the file.

    Raises:
        DatafileError: if resolving the resource path raises IndexError,
            or if a row does not contain exactly two fields.
        MappingError: if ``self._one_to_many_gr_by_line_map`` returns a
            truthy value for the loaded table.
    """
    g2p = defaultdict(list)
    # Row indices (0-based, counted after the discarded first row) at which
    # each grapheme string occurs; used only for error reporting.
    gr_by_line = defaultdict(list)
    try:
        path = os.path.join('data', 'map', code + '.csv')
        path = pkg_resources.resource_filename(__name__, path)
    except IndexError:
        # The message names the directory that is actually searched above.
        raise DatafileError(
            'Add an appropriately-named mapping to the data/map directory.')
    # NOTE(review): the file is opened in binary mode and `encoding` is
    # passed to csv.reader, which the stdlib reader does not accept;
    # presumably `csv` is bound to a unicode-aware reader at file level --
    # confirm.
    with open(path, 'rb') as f:
        reader = csv.reader(f, encoding='utf-8')
        next(reader)  # discard the first row
        for i, fields in enumerate(reader):
            try:
                graph, phon = fields
            except ValueError:
                # i counts from 0 after the discarded row, hence + 2 to get
                # a 1-based line number in the file.
                raise DatafileError(
                    'Map file is not well formed at line {}.'.format(i + 2))
            graph = unicodedata.normalize('NFC', graph)
            phon = unicodedata.normalize('NFC', phon)
            g2p[graph].append(phon)
            gr_by_line[graph].append(i)
    # The helper is first called on the grapheme->phoneme table to decide
    # whether there is a problem, then on the grapheme->row-index table to
    # obtain the offending grapheme and the rows to report.
    if self._one_to_many_gr_by_line_map(g2p):
        graph, lines = self._one_to_many_gr_by_line_map(gr_by_line)
        lines = [row_index + 2 for row_index in lines]
        # NOTE(review): the message is encoded to bytes before being handed
        # to MappingError; on Python 3 this would display as a bytes repr --
        # confirm this is intended.
        raise MappingError(
            'One-to-many G2P mapping for "{}" on lines {}'
            .format(graph, ', '.join(map(str, lines)))
            .encode('utf-8'))
    return g2p
def _read_rule(self, i, line):
    """Parse a single line of a rule file.

    Three kinds of line are recognized after stripping whitespace:

    * a blank line, which is ignored;
    * a symbol definition of the form ``::name:: = value``, which is stored
      in ``self.symbols`` under the key ``::name::``;
    * a rewrite rule of the form ``a -> b / X _ Y``, which is passed through
      ``self._sub_symbols`` and then handed to
      ``self._fields_to_function`` -- or to
      ``self._fields_to_function_metathesis`` when ``a`` contains a
      ``?P<sw1>`` group followed later by a ``?P<sw2>`` group.

    Args:
        i (int): index of the line; reported as ``i + 1`` in error messages
        line (str): raw text of the line

    Returns:
        The value returned by the ``_fields_to_function*`` helper for a
        rule line; None for blank lines and symbol definitions.

    Raises:
        DatafileError: if a non-blank, non-definition line does not match
            the rule syntax, or if the helper that builds the rule raises
            any exception.
    """
    line = line.strip()
    if not line:
        return None

    # Normalize to NFD and then to NFC before any matching is done.
    line = unicodedata.normalize('NFC', unicodedata.normalize('NFD', line))

    definition = re.match(r'(?P<symbol>::\w+::)\s*=\s*(?P<value>.+)', line)
    if definition:
        self.symbols[definition.group('symbol')] = definition.group('value')
        return None

    line = self._sub_symbols(line)
    rule = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
    if rule is None:
        raise DatafileError(
            'Line {}: "{}" cannot be parsed.'.format(i + 1, line))

    # Split on "->", "/" and "_" into four parts.
    a, b, X, Y = rule.groups()
    # '#' in the left context becomes the anchor '^'; in the right
    # context it becomes '$'.
    X, Y = X.replace('#', '^'), Y.replace('#', '$')
    # '0' is removed from both sides of the arrow (presumably it denotes
    # the empty string -- confirm).
    a, b = a.replace('0', ''), b.replace('0', '')
    try:
        if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
            return self._fields_to_function_metathesis(a, X, Y)
        return self._fields_to_function(a, b, X, Y)
    except Exception as e:
        # Any failure while building the rule is reported as a data-file
        # problem together with the offending line.
        raise DatafileError(
            'Line {}: "{}" cannot be compiled as regex: {}'.
            format(i + 1, line, e))