def __str__(self):
    """
    Return the reference representation of the sound.

    Note
    ----
    A sound that was not generated is answered from the data of the
    transcription system: a non-alias sound whose grapheme is known is
    returned as is, an alias is resolved through its feature set. A
    generated sound is re-created from its feature representation, i.e.
    the best matching base grapheme surrounded by the diacritics listed in
    the write order.

    Raises
    ------
    ValueError
        If a non-generated sound can be resolved neither by grapheme nor by
        feature set.
    """
    ts = self.ts

    # generated sounds need to be re-produced for double-checking, all other
    # sounds are looked up directly
    if not self.generated:
        if self.alias:
            if self.featureset in ts.features:
                return str(ts.features[self.featureset])
        elif self.grapheme in ts.sounds:
            return self.grapheme
        # this can usually not happen, as we catch these errors when loading
        # a ts!
        raise ValueError('Orphaned alias {0}'.format(
            self.grapheme))  # pragma: no cover

    # search for best base-string: try the full feature list first, then
    # drop leading features one by one; the last (shortest) match wins
    feature_names = [
        feat for feat in self._features() if feat not in EXCLUDE_FEATURES
    ]
    feature_names.append(self.type)
    fallback = self.base or '<?>'
    matched = []
    for start in range(len(feature_names)):
        hit = ts.features.get(frozenset(feature_names[start:]))
        if hit:
            matched.append(hit.grapheme)
    base_str = matched[-1] if matched else fallback

    # values already expressed by the base sound must not be written again;
    # the last word of the sound name is skipped
    if base_str == '<?>':
        base_vals = set()
    else:
        base_name = ts.sounds[base_str].name
        base_vals = {
            ts._feature_values[word] for word in base_name.split(' ')[:-1]
        }

    def diacritics(slot):
        # Render the diacritics of one write-order slot ('pre' or 'post').
        rendered = []
        for attr in self._write_order[slot]:
            if attr in base_vals:
                continue
            value = getattr(self, attr, '')
            if value in self._features():
                rendered.append(
                    norm(ts.features[self.type].get(value, '<!>')))
        return rendered

    return ''.join(diacritics('pre') + [base_str] + diacritics('post'))
def _norm(self, string):
    """Extended normalization of a grapheme string.

    The string is passed through ``norm``. If it contains the character
    "/", it is split into exactly two parts and the part after the slash
    replaces the ``norm`` result. The outcome is handed to
    ``self.normalize``, whose return value is returned.

    A string with more than one "/" raises ``ValueError`` (unpacking).
    """
    result = norm(string)
    if "/" in string:
        # NOTE(review): the part after the slash is passed on without
        # norm() -- confirm that self.normalize takes care of it.
        _source, result = string.split('/')
    return self.normalize(result)
def __init__(self, id_):
    """
    Load a transcription system and build its lookup tables.

    Reads the table group metadata, ``features.json``, ``diacritics.tsv``,
    one ``<type>s.tsv`` table per sound class (consonant, vowel, tone,
    marker) and ``normalize.tsv``, and fills ``features``, ``diacritics``,
    ``sounds``, ``columns``, ``sound_classes``, ``_feature_values`` and
    ``_normalize``.

    :param id_: Identifier of the transcription system. It is resolved with \
    ``pkg_path('transcriptionsystems', id_)`` and must name an existing \
    directory.
    :raises AssertionError: if ``id_`` is falsy.
    :raises ValueError: if the system directory does not exist, if a sound \
    table contains a duplicate grapheme, an unrecognized feature value or a \
    duplicate feature set, or if an alias has no non-aliased counterpart.
    """
    if hasattr(self, 'features'):
        # Only initialize, if this is really a new instance!
        # NOTE(review): presumably instances are cached and handed out again
        # by the class, so __init__ may run on an already loaded object --
        # confirm against the instance creation code.
        return
    assert id_
    system = pkg_path('transcriptionsystems', id_)
    if not (system.exists() and system.is_dir()):
        raise ValueError('unknown system: {0}'.format(id_))

    # The table group is described by one shared metadata file; its file
    # name is then pointed at the directory of the requested system.
    self.system = TableGroup.from_file(
        pkg_path('transcriptionsystems',
                 'transcription-system-metadata.json'))
    self.system._fname = system / 'metadata.json'

    # Holds two kinds of entries: per sound type a mapping from feature
    # value to diacritic grapheme (filled from diacritics.tsv), and, keyed
    # by feature set, the non-aliased sounds (filled from the sound tables).
    self.features = {'consonant': {}, 'vowel': {}, 'tone': {}}
    # dictionary for feature values, checks when writing elements from
    # write_order to make sure no output is doubled
    self._feature_values = {}

    # load the general features
    features = jsonlib.load(
        pkg_path('transcriptionsystems', 'features.json'))

    self.diacritics = dict(consonant={},
                           vowel={},
                           click={},
                           diphthong={},
                           tone={},
                           cluster={})
    for dia in itertable(self.system.tabledict['diacritics.tsv']):
        # only rows that are neither alias nor typography define how a
        # feature value is written
        if not dia['alias'] and not dia['typography']:
            self.features[dia['type']][dia['value']] = dia['grapheme']
        # assign feature values to the dictionary
        self._feature_values[dia['value']] = dia['feature']
        self.diacritics[dia['type']][dia['grapheme']] = dia['value']

    self.sound_classes = {}
    self.columns = {}  # the basic column structure, to allow for rendering
    self.sounds = {}  # Sounds by grapheme
    self._covered = {}
    # check for unresolved aliased sounds
    aliases = []
    for cls in [Consonant, Vowel, Tone, Marker]:  # noqa: F405
        # the lower-cased class name doubles as sound type and as stem of
        # the table name ('<type>s.tsv')
        type_ = cls.__name__.lower()
        self.sound_classes[type_] = cls
        # store information on column structure to allow for rendering of a
        # sound in this form, which will make it easier to insert it when
        # finding generated sounds
        self.columns[type_] = [
            c['name'].lower()
            for c in self.system.tabledict['{0}s.tsv'.format(
                type_)].asdict()['tableSchema']['columns']
        ]
        # NOTE(review): `l + 2` in the messages below presumably turns the
        # 0-based row index into a line number of the file (header line
        # plus 1-based counting) -- confirm.
        for l, item in enumerate(
                itertable(
                    self.system.tabledict['{0}s.tsv'.format(type_)])):
            if item['grapheme'] in self.sounds:
                raise ValueError(
                    'duplicate grapheme in {0}:{1}: {2}'.format(
                        type_ + 's.tsv', l + 2, item['grapheme']))
            sound = cls(ts=self, **item)
            # make sure this does not take too long
            # (a value is registered and validated only the first time it
            # is seen)
            for key, value in item.items():
                if key not in {'grapheme', 'note', 'alias'} and \
                        value and value not in self._feature_values:
                    self._feature_values[value] = key
                    # markers are not checked against features.json
                    if type_ != 'marker' and value not in features[type_][
                            key]:
                        raise ValueError(
                            "Unrecognized features ({0}: {1}, line {2}))".
                            format(key, value, l + 2))

            self.sounds[item['grapheme']] = sound
            if not sound.alias:
                # a feature set may belong to one non-aliased sound only
                if sound.featureset in self.features:
                    raise ValueError(
                        'duplicate features in {0}:{1}: {2}'.format(
                            type_ + 's.tsv', l + 2, sound.name))
                self.features[sound.featureset] = sound
            else:
                # aliases are checked after all tables have been read
                aliases += [(l, sound.type, sound.featureset)]

    # check for consistency of aliases: if an alias has no counterpart, it
    # is orphaned and needs to be deleted or given an accepted non-aliased
    # sound
    if [x for x in aliases
            if x[2] not in self.features]:  # pragma: no cover
        error = ', '.join(
            text_type(x[0] + 2) + '/' + text_type(x[1])
            for x in aliases if x[2] not in self.features)
        raise ValueError('Orphaned aliases in line(s) {0}'.format(error))

    # basic regular expression, used to match the basic sounds in the system.
    self._regex = None
    self._update_regex()

    # normalization data
    # (source and target are both passed through norm() before being stored)
    self._normalize = {
        norm(r['source']): norm(r['target'])
        for r in itertable(self.system.tabledict['normalize.tsv'])
    }