def test_ts():
    """Constructing a TranscriptionSystem from an invalid id must fail."""
    # The empty identifier is expected to trip an assertion ...
    with pytest.raises(AssertionError):
        TranscriptionSystem('')

    # ... whereas each of these identifiers is expected to be rejected
    # with a ValueError.
    for bad_id in ('_f1', '_f2', '_f3', 'what'):
        with pytest.raises(ValueError):
            TranscriptionSystem(bad_id)
def __init__(self, id_):
    """Load the sound class system `id_` from ``soundclasses/lingpy.tsv``.

    Populates ``data`` (first entry per key), ``classes`` (the set of class
    graphemes), ``sounds``, ``names`` and ``system`` (the BIPA
    transcription system).
    """
    # Only initialize, if this is really a new instance!
    if hasattr(self, 'data'):
        return

    assert id_ in SOUNDCLASS_SYSTEMS
    raw, self.sounds, self.names = read_data(
        'soundclasses', 'lingpy.tsv', id_)
    self.data = {}
    self.classes = set()
    for key, entries in raw.items():
        # Only the first entry of each key is kept.
        first = entries[0]
        self.data[key] = first
        self.classes.add(first['grapheme'])
    self.system = TranscriptionSystem('bipa')
def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources.

    For every source of type ``td`` a table is written to
    ``transcriptiondata/<NAME>.tsv`` that maps each (unique) source grapheme
    to its BIPA grapheme and CLTS name, or to ``<NA>`` if the sound is not
    valid in BIPA.  Afterwards ``soundclasses/lingpy.tsv`` is written with
    the sound class of every non-alias BIPA sound for each system in
    ``SOUNDCLASS_SYSTEMS``.

    Parameters
    ----------
    args
        Command line namespace; ``args.repos`` and ``args.log`` are used.
    """
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(
            src['URITEMPLATE']) if src['URITEMPLATE'] else None
        # First row is the header of the output table.
        out = [[
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT',
            'GRAPHEME', 'URL'
        ] + columns]
        graphemes = set()
        for row in rows:
            if row['GRAPHEME'] in graphemes:
                # Logger.warn is a deprecated alias of Logger.warning.
                args.log.warning('skipping duplicate grapheme: {0}'.format(
                    row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            # An explicit BIPA value in the source takes precedence over
            # parsing the source grapheme itself.
            if not row['BIPA']:
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(
                **row) if uritemplate else row.get('URL', '')
            out.append([
                bipa_grapheme, bipa_name, generated, explicit,
                row['GRAPHEME'], url
            ] + [row.get(c, '') for c in columns])

        # Statistics are computed over the data rows only: the header row
        # must be counted neither as a grapheme nor as "found".
        data_rows = out[1:]
        total = len(data_rows)
        found = sum(1 for o in data_rows if o[0] != '<NA>')
        # 100.0 forces true division; an empty source reports 0%.
        percent = 100.0 * found / total if total else 0.0
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, total, percent))
        with UnicodeWriter(
                pkg_path('transcriptiondata', '{0}.tsv'.format(src['NAME'])),
                delimiter='\t') as writer:
            writer.writerows(out)

    # The models do not depend on the grapheme, so build them once instead
    # of once per written row.
    models = [Model(cls) for cls in SOUNDCLASS_SYSTEMS]
    count = 0
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            if not sound.alias:
                writer.writerow([sound.name, grapheme] + [
                    token2class(grapheme, model) for model in models
                ])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
def __init__(self, id_):
    """Load the transcription data set `id_` from ``transcriptiondata``.

    Reads ``<id_>.tsv`` via ``read_data`` and stores the result in
    ``data``, ``sounds`` and ``names``; ``system`` is set to the BIPA
    transcription system.
    """
    # Only initialize, if this is really a new instance!
    if hasattr(self, 'data'):
        return

    # NOTE(review): 'URL' is listed twice - confirm whether read_data
    # needs the duplicate before removing it.
    columns = (
        'GRAPHEME', 'URL', 'BIPA_GRAPHEME', 'GENERATED', 'URL', 'LATEX',
        'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE', 'EXPLICIT',
    )
    self.data, self.sounds, self.names = read_data(
        'transcriptiondata', id_ + '.tsv', *columns)
    self.system = TranscriptionSystem('bipa')
def _make_app_data(args, test=False):
    """Collect all known sounds and write them to the app's ``data.js``.

    Sounds are gathered from the transcription data sets, completed with the
    sounds of the BIPA transcription system and annotated with their sound
    classes.  The result is written as two JavaScript variables (``BIPA``
    and ``normalize``).

    Parameters
    ----------
    args
        Command line namespace; ``args.repos`` and ``args.log`` are used.
    test : bool
        If true, only the first transcription data set and the first sound
        class system are processed.
    """
    bipa = TranscriptionSystem('bipa')

    def sound_to_dict(snd):
        # Basic description of a sound plus one entry per name feature.
        entry = {'name': snd.name, 'bipa': snd.s, 'type': snd.type}
        entry.update(
            (feature, getattr(snd, feature)) for feature in snd._name_order)
        return entry

    # retrieve all sounds in the datasets
    all_sounds = {}
    for td in args.repos.iter_transcriptiondata():
        for key in td.data:
            # Only keys containing a space are handled here.
            if ' ' not in key:
                continue
            snd = bipa[key]
            glyph = snd.s
            assert '<?>' not in glyph
            if glyph not in all_sounds:
                all_sounds[glyph] = sound_to_dict(snd)
            entry = all_sounds[glyph]
            items = td.data[key]
            for item in items:
                # Graphemes of the data set share the very same dict object
                # as the BIPA glyph, so later annotations reach all of them.
                all_sounds.setdefault(item['grapheme'], entry)
            entry[td.id] = items
        if test:
            break

    # add sounds from transcription system
    for grapheme in bipa:
        if grapheme in all_sounds:
            continue
        snd = bipa[grapheme]
        if snd.type != 'marker':
            all_sounds[grapheme] = (
                all_sounds[snd.s] if snd.s in all_sounds
                else sound_to_dict(snd))

    args.log.info('{0} unique graphemes loaded'.format(len(all_sounds)))

    for index, sc in enumerate(args.repos.iter_soundclass()):
        for key in all_sounds:
            try:
                all_sounds[key][sc.id] = [{'grapheme': sc[key]}]
            except KeyError:  # pragma: no cover
                pass
            # NOTE(review): the keys stored above are graphemes, so this
            # hasattr check looks like it can never succeed - confirm.
            if index == 0 and hasattr(key, 's'):
                all_sounds[key]['bipa'] = bipa[key].s
        if test:
            break

    datafile = args.repos.app_path('data.js')
    with datafile.open('w', encoding='utf8') as handler:
        handler.write(
            'var BIPA = ' + json.dumps(all_sounds, indent=2) + ';\n')
        handler.write(
            'var normalize = ' + json.dumps(bipa._normalize) + ';\n')
    args.log.info('{0} written'.format(datafile))
def features(args):
    """Print all (type, feature, value) triples of a transcription system.

    The system is selected by ``args.system``; markers and unknown sounds
    are left out and missing values are shown as empty strings.
    """
    system = TranscriptionSystem(args.system)
    skipped_types = ['marker', 'unknownsound']
    triples = {
        (snd.type, feature, value or '')
        for snd in system.sounds.values()
        if snd.type not in skipped_types
        for feature, value in snd.featuredict.items()
    }
    table = Table('TYPE', 'FEATURE', 'VALUE')
    table.extend(sorted(triples))
    print(table.render(tablefmt='simple'))
def sounds(args):
    """Print a table describing the sounds given on the command line.

    Every entry of ``args.args`` is looked up in the transcription system
    ``args.system``; unknown sounds are rendered as question marks.
    """
    tts = TranscriptionSystem(args.system)
    rows = []
    for raw in args.args:
        # Byte strings are decoded before the lookup.
        token = raw if isinstance(raw, text_type) else raw.decode('utf8')
        snd = tts.get(token)
        if snd.type == 'unknownsound':
            rows.append(['?', snd.source, '?', '?', '?'])
            continue
        rows.append([
            text_type(snd),
            snd.source or ' ',
            '1' if snd.generated else ' ',
            snd.grapheme if snd.alias else ' ',
            snd.name,
        ])
    tbl = Table(
        args.system.upper(), 'SOURCE', 'GENERATED', 'ALIAS', 'NAME',
        rows=rows)
    print(tbl.render(tablefmt=args.format, condensed=False))
def dstats(args):
    """Print, per transcription data set, how many names are valid in BIPA.

    One row is printed per data set with the number of valid names, the
    total number of names and their ratio (a fraction between 0 and 1, shown
    in the ``percent`` column).  A final row gives the number of data sets
    and the mean of the ratios.

    Parameters
    ----------
    args
        Command line namespace; ``args.repos`` is used.
    """
    table = [['id', 'valid', 'total', 'percent']]
    bipa = TranscriptionSystem('bipa')
    for td in args.repos.iter_transcriptiondata():
        flags = [
            1 if is_valid_sound(bipa[name], bipa) else 0
            for name in td.names
        ]
        valid, total = sum(flags), len(flags)
        # float() forces true division; a data set without any names is
        # reported with a ratio of 0 instead of raising ZeroDivisionError.
        ratio = float(valid) / total if total else 0.0
        table.append([td.id, valid, total, ratio])

    n_datasets = len(table) - 1
    # Same guard for the summary row when there are no data sets at all.
    mean_ratio = (
        sum(row[-1] for row in table[1:]) / n_datasets
        if n_datasets else 0.0)
    table.append([n_datasets, '', '', mean_ratio])
    print(tabulate.tabulate(table, headers='firstrow'))
def table(args):
    """Print the sounds given on the command line, one table per sound class.

    The sounds in ``args.args`` are looked up in the transcription system
    ``args.system`` and optionally restricted by ``args.filter``
    (``generated``, ``unknown`` or ``known``).  Unknown sounds are listed in
    a separate, numbered table at the end.
    """
    tts = TranscriptionSystem(args.system)
    selected = [
        tts.get(tok if isinstance(tok, text_type) else tok.decode('utf8'))
        for tok in args.args
    ]

    # Any other filter value leaves the selection untouched.
    predicates = {
        'generated': lambda s: s.generated,
        'unknown': lambda s: s.type == 'unknownsound',
        'known': lambda s: not s.generated and not s.type == 'unknownsound',
    }
    keep = predicates.get(args.filter)
    if keep is not None:
        selected = [s for s in selected if keep(s)]

    grouped = defaultdict(list)
    unknown_count = 0
    for snd in selected:
        if snd.type == 'unknownsound':
            unknown_count += 1
            grouped['unknownsound'].append(
                [text_type(unknown_count), snd.source or '', snd.grapheme])
        else:
            grouped[snd.type].append(snd.table)

    for cls in tts.sound_classes:
        if cls not in grouped:
            continue
        print('# {0}\n'.format(cls))
        header = [c.upper() for c in tts.columns[cls]]
        tbl = Table(*header, rows=grouped[cls])
        print(tbl.render(tablefmt=args.format, condensed=False))
        print('')

    unknown_rows = grouped['unknownsound']
    if unknown_rows:
        print('# Unknown sounds\n')
        tbl = Table('NUMBER', 'SOURCE', 'GRAPHEME', rows=unknown_rows)
        print(tbl.render(tablefmt=args.format, condensed=False))
class SoundClasses(TranscriptionBase):
    """
    Class for handling sound class models.
    """

    def __init__(self, id_):
        """Load the sound class system `id_` from ``soundclasses/lingpy.tsv``."""
        # Only initialize, if this is really a new instance!
        if hasattr(self, 'data'):
            return

        assert id_ in SOUNDCLASS_SYSTEMS
        raw, self.sounds, self.names = read_data(
            'soundclasses', 'lingpy.tsv', id_)
        self.data = {}
        self.classes = set()
        for key, entries in raw.items():
            # Only the first entry of each key is kept.
            first = entries[0]
            self.data[key] = first
            self.classes.add(first['grapheme'])
        self.system = TranscriptionSystem('bipa')

    def resolve_sound(self, sound):
        """Function tries to identify a sound in the data.

        Notes
        -----
        The function tries to resolve sounds to take a sound with less
        complex features in order to yield the next approximate sound class,
        if the transcription data are sound classes.

        Raises
        ------
        KeyError
            If no sound class could be determined for the sound.
        """
        if not isinstance(sound, Sound):
            sound = self.system[sound]

        # Direct hit: the sound's name is listed in the data.
        if sound.name in self.data:
            return self.data[sound.name]['grapheme']

        if sound.type == 'unknownsound':
            raise KeyError(":sc:resolve_sound: No sound could be found.")

        # Diphthongs and clusters are resolved via their ``from_sound``.
        if sound.type in ['diphthong', 'cluster']:
            return self.resolve_sound(sound.from_sound)

        # Drop the parts of the name that the system maps to one of the
        # ignored features, then strip leading parts one by one until a
        # listed sound is found or fewer than four parts remain.
        ignored = ['laminality', 'ejection', 'tone']
        parts = [
            part for part in sound.name.split(' ')
            if self.system._feature_values.get(part, '') not in ignored
        ]
        while len(parts) >= 4:
            candidate = self.system.get(' '.join(parts))
            if candidate and candidate.name in self.data:
                return self.resolve_sound(candidate)
            parts.pop(0)

        raise KeyError(":sc:resolve_sound: No sound could be found.")
from lingpy import *
import json
from bxs import sampa
from unicodedata import normalize
from pyclts.transcriptionsystem import TranscriptionSystem
from lingpy.data.ipa.sampa import xs

# Build a grapheme profile (``prof``) and a lookup table (``mapper``) from
# the entries of ``sampa``, using the BIPA transcription system.
bipa = TranscriptionSystem('bipa')
data = csv2list('graphemes.tsv')
# Header row of the profile; one row is appended per new grapheme below.
prof = [['Grapheme', 'IPA', 'BIPA', 'CLTS_Name']]
visited = set()
mapper = {}
for k, v in sampa.items():
    # Values written as "U+XXXX" are replaced (in place) by the character
    # with that hexadecimal code point.
    if 'U+' in v['ipa']:
        v['ipa'] = chr(int('0x' + v['ipa'][2:], 0))
    # Register both Unicode normalization forms of the IPA string.
    mapper[normalize('NFD', v['ipa'])] = v['grapheme']
    mapper[normalize('NFC', v['ipa'])] = v['grapheme']
    sound = bipa[v['ipa']]
    # NOTE(review): the nesting of the ``visited`` check under this
    # condition is inferred - confirm against the original layout.
    if not sound.type in ['unknownsound', 'marker']:
        mapper[sound.s] = v['grapheme']
        # Each grapheme contributes at most one row to the profile.
        if v['grapheme'] not in visited:
            prof += [[
                v['grapheme'], v['ipa'], sound.s,
                sound.name.replace(' ', '_')
            ]]
            visited.add(v['grapheme'])
def dump(args, test=False):
    """Write ``sounds.tsv`` and ``graphemes.tsv`` to the data directory.

    Sounds are assembled from the BIPA transcription system, from all
    transcription data sets, from the sound class systems and finally from
    the remaining transcription systems.  ``sounds.tsv`` gets one row per
    sound name, ``graphemes.tsv`` one row per collected ``Grapheme``.

    Parameters
    ----------
    args
        Command line namespace; ``args.repos`` and ``args.log`` are used.
    test : bool
        If true, each of the loops over data sets, sound class systems and
        transcription systems stops after its first item.
    """
    sounds = defaultdict(dict)
    data = []
    bipa = TranscriptionSystem('bipa')

    # start from assembling bipa-sounds
    # Entries with a falsy alias sort first, so that a sound is registered
    # before its aliases are attached to it (the asserts below rely on it).
    for grapheme, sound in sorted(
            bipa.sounds.items(),
            key=lambda p: p[1].alias if p[1].alias else False):
        if sound.type not in ['marker']:
            if sound.alias:
                assert sound.name in sounds
                sounds[sound.name]['aliases'].add(grapheme)
            else:
                assert sound.name not in sounds
                sounds[sound.name] = {
                    'grapheme': grapheme,
                    'unicode': sound.uname or '',
                    'generated': '',
                    'note': sound.note or '',
                    'type': sound.type,
                    'aliases': set(),
                    'normalized': '+' if sound.normalized else ''
                }
            data.append(
                Grapheme(grapheme, sound.name, '+', '', 'bipa', '0', '', '',
                         '', '', sound.note or ''))

    # add sounds systematically by their alias
    for td in args.repos.iter_transcriptiondata():
        for name in td.names:
            bipa_sound = bipa[name]
            # check for consistency of mapping here
            if not is_valid_sound(bipa_sound, bipa):
                continue

            sound = sounds.get(name)
            if not sound:
                # Not part of the BIPA inventory itself: record the sound as
                # a generated one.
                sound = sounds[name] = {
                    'grapheme': bipa_sound.s,
                    'aliases': {bipa_sound.s},
                    'generated': '+',
                    'unicode': bipa_sound.uname or '',
                    'note': '',
                    'type': bipa_sound.type,
                    'alias': '+' if bipa_sound.alias else '',
                    'normalized': '+' if bipa_sound.normalized else ''
                }

            for item in td.data[name]:
                sound['aliases'].add(item['grapheme'])
                # add the values here
                data.append(
                    Grapheme(
                        item['grapheme'],
                        name,
                        item['explicit'],
                        '',  # presumably the alias column; left empty
                        td.id,
                        item.get('frequency', ''),
                        item.get('url', ''),
                        item.get('features', ''),
                        item.get('image', ''),
                        item.get('sound', ''),
                    ))
        if test:
            break

    # sound classes have a generative component, so we need to treat them
    # separately
    for sc in args.repos.iter_soundclass():
        for name in sounds:
            try:
                grapheme = sc[name]
                data.append(
                    Grapheme(
                        grapheme,
                        name,
                        '+' if name in sc.data else '',
                        '',
                        sc.id,
                    ))
            except KeyError:  # pragma: no cover
                # Format the message up front: passing the grapheme as a
                # second positional argument would make logging try to
                # %-format it into a message without placeholders.
                args.log.debug(
                    '{0}: {1}'.format(name, sounds[name]['grapheme']))
        if test:
            break

    # last run, check again for each of the remaining transcription systems,
    # whether we can translate the sound
    for ts in args.repos.iter_transcriptionsystem(exclude=['bipa']):
        for name in sounds:
            try:
                ts_sound = ts[name]
                if is_valid_sound(ts_sound, ts):
                    sounds[name]['aliases'].add(ts_sound.s)
                    data.append(
                        Grapheme(
                            ts_sound.s,
                            name,
                            '' if sounds[name]['generated'] else '+',
                            '',  # presumably the alias column; left empty
                            ts.id,
                        ))
            except ValueError:
                # Best effort: a name that raises ValueError in this system
                # is skipped.
                pass
            except TypeError:
                args.log.debug('{0}: {1}'.format(ts.id, name))
        if test:
            break

    with UnicodeWriter(args.repos.data_path('sounds.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(
            ['NAME', 'TYPE', 'GRAPHEME', 'UNICODE', 'GENERATED', 'NOTE'])
        for k, v in sorted(sounds.items(), reverse=True):
            writer.writerow([
                k, v['type'], v['grapheme'], v['unicode'], v['generated'],
                v['note']
            ])

    with UnicodeWriter(args.repos.data_path('graphemes.tsv'),
                       delimiter='\t') as writer:
        writer.writerow([f.name for f in attr.fields(Grapheme)])
        for row in data:
            writer.writerow(attr.astuple(row))