def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources.

    For every source of type ``'td'`` yielded by ``args.repos.iter_sources``
    one TSV file named after the source is written below the package's
    ``transcriptiondata`` directory. Each source grapheme is mapped to a BIPA
    sound; sounds rejected by ``is_valid_sound`` are recorded as ``<NA>``.
    Afterwards ``soundclasses/lingpy.tsv`` is written with the sound class of
    every non-alias BIPA sound for each system in ``SOUNDCLASS_SYSTEMS``.

    :param args: Command arguments providing ``args.repos`` (data access) \
    and ``args.log`` (logger).
    """
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(
            src['URITEMPLATE']) if src['URITEMPLATE'] else None
        out = [[
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT',
            'GRAPHEME', 'URL'
        ] + columns]
        graphemes = set()
        for row in rows:
            if row['GRAPHEME'] in graphemes:
                args.log.warning('skipping duplicate grapheme: {0}'.format(
                    row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            if not row['BIPA']:
                # no explicit mapping given: look the grapheme itself up
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(
                **row) if uritemplate else row.get('URL', '')
            out.append([
                bipa_grapheme, bipa_name, generated, explicit,
                row['GRAPHEME'], url
            ] + [row.get(c, '') for c in columns])

        # The first entry of `out` is the header row; it must not be counted
        # as a grapheme, neither in the total nor among the found ones.
        records = out[1:]
        total = len(records)
        found = sum(1 for record in records if record[0] != '<NA>')
        percent = found / total * 100 if total else 0.0
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, total, percent))
        with UnicodeWriter(
                pkg_path('transcriptiondata', '{0}.tsv'.format(src['NAME'])),
                delimiter='\t') as writer:
            writer.writerows(out)

    count = 0
    with UnicodeWriter(
            pkg_path('soundclasses', 'lingpy.tsv'), delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        # The models do not depend on the grapheme, so build each one once
        # instead of once per written row.
        models = [Model(cls) for cls in SOUNDCLASS_SYSTEMS]
        for grapheme, sound in sorted(bipa.sounds.items()):
            if not sound.alias:
                writer.writerow([sound.name, grapheme] + [
                    token2class(grapheme, model) for model in models
                ])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
def dstats(args):
    """Print per-dataset statistics on how many names map to valid BIPA sounds.

    One row is printed for each transcription data set yielded by
    ``args.repos.iter_transcriptiondata()``: its id, the number of names
    accepted by ``is_valid_sound``, the total number of names and their
    ratio. Despite its header, the ``percent`` column holds a fraction
    between 0 and 1. A final row holds the number of data sets and the mean
    of the per-dataset ratios.

    :param args: Command arguments providing ``args.repos``.
    """
    table = [['id', 'valid', 'total', 'percent']]
    bipa = TranscriptionSystem('bipa')
    for td in args.repos.iter_transcriptiondata():
        flags = [
            1 if is_valid_sound(bipa[name], bipa) else 0 for name in td.names
        ]
        valid, total = sum(flags), len(flags)
        # a data set without names would otherwise raise ZeroDivisionError
        table.append([td.id, valid, total, valid / total if total else 0.0])

    rows = table[1:]
    # same guard for the case that there are no data sets at all
    mean = sum(row[-1] for row in rows) / len(rows) if rows else 0.0
    table.append([len(rows), '', '', mean])
    print(tabulate.tabulate(table, headers='firstrow'))
def test_is_valid_sound(bipa):
    """``is_valid_sound`` must reject the sound for '_' and accept the one for 'ä'."""
    # (grapheme, expected validity), checked in this order
    expectations = [('_', False), ('ä', True)]
    for grapheme, expected in expectations:
        assert bool(is_valid_sound(bipa[grapheme], bipa)) is expected
def dump(args, test=False):
    """Collect all sounds and graphemes and write them to TSV files.

    The sounds are assembled in four passes: the sounds of the BIPA
    transcription system, the transcription data sets, the sound classes and
    finally the remaining transcription systems. The result is written to
    ``sounds.tsv`` (one row per sound name) and ``graphemes.tsv`` (one row
    per collected ``Grapheme``) at the locations given by
    ``args.repos.data_path``.

    :param args: Command arguments providing ``args.repos`` (data access) \
    and ``args.log`` (logger).
    :param test: If true, each of the last three passes stops after the \
    first item yielded by its repository iterator.
    """
    sounds = defaultdict(dict)
    data = []
    bipa = TranscriptionSystem('bipa')
    # start from assembling bipa-sounds; the sort key puts sounds without an
    # alias flag first, so that the sound an alias refers to is already
    # registered when the alias is processed (see the assertions below)
    for grapheme, sound in sorted(
            bipa.sounds.items(),
            key=lambda p: p[1].alias if p[1].alias else False):
        if sound.type not in ['marker']:
            if sound.alias:
                assert sound.name in sounds
                sounds[sound.name]['aliases'].add(grapheme)
            else:
                assert sound.name not in sounds
                sounds[sound.name] = {
                    'grapheme': grapheme,
                    'unicode': sound.uname or '',
                    'generated': '',
                    'note': sound.note or '',
                    'type': sound.type,
                    'aliases': set(),
                    'normalized': '+' if sound.normalized else ''
                }
            data.append(
                Grapheme(grapheme, sound.name, '+', '', 'bipa', '0', '', '',
                         '', '', sound.note or ''))

    # add sounds systematically by their alias
    for td in args.repos.iter_transcriptiondata():
        for name in td.names:
            bipa_sound = bipa[name]
            # check for consistency of mapping here
            if not is_valid_sound(bipa_sound, bipa):
                continue

            sound = sounds.get(name)
            if not sound:
                # not among the sounds collected so far: register it,
                # flagged as generated
                sound = sounds[name] = {
                    'grapheme': bipa_sound.s,
                    'aliases': {bipa_sound.s},
                    'generated': '+',
                    'unicode': bipa_sound.uname or '',
                    'note': '',
                    'type': bipa_sound.type,
                    'alias': '+' if bipa_sound.alias else '',
                    'normalized': '+' if bipa_sound.normalized else ''
                }

            for item in td.data[name]:
                sound['aliases'].add(item['grapheme'])
                # add the values here
                data.append(
                    Grapheme(
                        item['grapheme'],
                        name,
                        item['explicit'],
                        '',  # commented-out alternative: sounds[name]['alias']
                        td.id,
                        item.get('frequency', ''),
                        item.get('url', ''),
                        item.get('features', ''),
                        item.get('image', ''),
                        item.get('sound', ''),
                    ))
        if test:
            break

    # sound classes have a generative component, so we need to treat them
    # separately
    for sc in args.repos.iter_soundclass():
        for name in sounds:
            try:
                grapheme = sc[name]
                data.append(
                    Grapheme(
                        grapheme,
                        name,
                        '+' if name in sc.data else '',
                        '',
                        sc.id,
                    ))
            except KeyError:  # pragma: no cover
                # Build the complete message up front: passing the grapheme
                # as a second positional argument would make logging try to
                # %-format it into a message without placeholders.
                args.log.debug(
                    '{0}: {1}'.format(name, sounds[name]['grapheme']))
        if test:
            break

    # last run, check again for each of the remaining transcription systems,
    # whether we can translate the sound
    for ts in args.repos.iter_transcriptionsystem(exclude=['bipa']):
        for name in sounds:
            try:
                ts_sound = ts[name]
                if is_valid_sound(ts_sound, ts):
                    sounds[name]['aliases'].add(ts_sound.s)
                    data.append(
                        Grapheme(
                            ts_sound.s,
                            name,
                            '' if sounds[name]['generated'] else '+',
                            '',  # commented-out alternative: sounds[name]['alias']
                            ts.id,
                        ))
            except ValueError:
                # best effort: a ValueError raised for this name is skipped
                # silently
                pass
            except TypeError:
                args.log.debug('{0}: {1}'.format(ts.id, name))
        if test:
            break

    with UnicodeWriter(
            args.repos.data_path('sounds.tsv'), delimiter='\t') as writer:
        writer.writerow(
            ['NAME', 'TYPE', 'GRAPHEME', 'UNICODE', 'GENERATED', 'NOTE'])
        for k, v in sorted(sounds.items(), reverse=True):
            writer.writerow([
                k, v['type'], v['grapheme'], v['unicode'], v['generated'],
                v['note']
            ])

    with UnicodeWriter(
            args.repos.data_path('graphemes.tsv'), delimiter='\t') as writer:
        writer.writerow([f.name for f in attr.fields(Grapheme)])
        for row in data:
            writer.writerow(attr.astuple(row))