Beispiel #1
0
def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources.

    For every source of type ``'td'`` yielded by
    ``args.repos.iter_sources``, each grapheme is looked up in the BIPA
    transcription system and the result is written as a tab-separated file
    ``transcriptiondata/<NAME>.tsv`` (path resolved through ``pkg_path``).
    Duplicate graphemes within one source are skipped with a warning.

    Afterwards ``soundclasses/lingpy.tsv`` is written; it holds, for every
    BIPA sound whose ``alias`` attribute is falsy, the sound class assigned
    by each system listed in ``SOUNDCLASS_SYSTEMS``.

    :param args: object exposing ``repos`` (providing ``iter_sources``) and
        ``log`` (a logger used for progress messages).
    """
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    header = [
        'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT', 'GRAPHEME',
        'URL'
    ] + columns
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(
            src['URITEMPLATE']) if src['URITEMPLATE'] else None
        out = [header]
        graphemes = set()
        for row in rows:
            if row['GRAPHEME'] in graphemes:
                args.log.warning('skipping duplicate grapheme: {0}'.format(
                    row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            # An explicit BIPA value in the source takes precedence over the
            # source's own grapheme; such rows are flagged as explicit.
            if not row['BIPA']:
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            # Build the URL from the template when the source defines one,
            # otherwise fall back to the row's own URL column (if any).
            url = uritemplate.expand(
                **row) if uritemplate else row.get('URL', '')
            out.append([
                bipa_grapheme, bipa_name, generated, explicit, row['GRAPHEME'],
                url
            ] + [row.get(c, '') for c in columns])

        # Report coverage over the data rows only: the header row must not
        # be counted as a (found) grapheme.
        data_rows = out[1:]
        total = len(data_rows)
        found = len([o for o in data_rows if o[0] != '<NA>'])
        percent = found / total * 100 if total else 0.0
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, total, percent))
        with UnicodeWriter(pkg_path('transcriptiondata',
                                    '{0}.tsv'.format(src['NAME'])),
                           delimiter='\t') as writer:
            writer.writerows(out)

    # Instantiate each sound class model once instead of once per grapheme.
    models = [Model(cls) for cls in SOUNDCLASS_SYSTEMS]
    count = 0
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            if not sound.alias:
                writer.writerow([sound.name, grapheme] + [
                    token2class(grapheme, model) for model in models
                ])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
Beispiel #2
0
def dstats(args):
    """Print a table of BIPA coverage per transcription dataset.

    For each dataset yielded by ``args.repos.iter_transcriptiondata()`` one
    row is produced with the dataset id, the number of its names that
    resolve to a valid BIPA sound, the total number of names, and the ratio
    of the two.  A final summary row holds the number of datasets and the
    mean of the per-dataset ratios.

    Note that the ``percent`` column holds a ratio in the range 0..1, not a
    value scaled to 100.

    :param args: object exposing ``repos`` (providing
        ``iter_transcriptiondata``).
    """
    table = [['id', 'valid', 'total', 'percent']]
    bipa = TranscriptionSystem('bipa')
    for td in args.repos.iter_transcriptiondata():
        flags = [
            1 if is_valid_sound(bipa[name], bipa) else 0 for name in td.names
        ]
        valid, total = sum(flags), len(flags)
        # A dataset without names has no coverage to report; avoid dividing
        # by zero and record a ratio of 0.0 instead.
        ratio = valid / total if total else 0.0
        table.append([td.id, valid, total, ratio])

    datasets = len(table) - 1
    # Same guard for the summary row when no dataset was yielded at all.
    mean_ratio = (
        sum(line[-1] for line in table[1:]) / datasets if datasets else 0.0)
    table.append([datasets, '', '', mean_ratio])
    print(tabulate.tabulate(table, headers='firstrow'))
Beispiel #3
0
def test_is_valid_sound(bipa):
    """Check ``is_valid_sound`` against one rejected and one accepted grapheme.

    ``bipa`` is supplied by the test fixture of the same name.
    """
    expectations = (
        ('_', False),
        ('ä', True),
    )
    for grapheme, expected in expectations:
        assert bool(is_valid_sound(bipa[grapheme], bipa)) is expected
Beispiel #4
0
def dump(args, test=False):
    """Collect all known sounds and graphemes and write them to TSV files.

    The data is assembled in four passes:

    1. every non-marker sound of the BIPA transcription system,
    2. every valid BIPA sound referenced by the transcription datasets,
    3. the sound class systems (looked up by sound name),
    4. every other transcription system (excluding BIPA).

    Two files are written via ``args.repos.data_path``: ``sounds.tsv`` with
    one row per sound name, and ``graphemes.tsv`` with one row per collected
    ``Grapheme`` record.

    :param args: object exposing ``repos`` (iterators over transcription
        data, sound classes and transcription systems, plus ``data_path``)
        and ``log`` (a logger).
    :param test: when true, passes 2-4 each stop after the first item
        yielded by their respective iterator.
    """
    sounds = defaultdict(dict)
    data = []
    bipa = TranscriptionSystem('bipa')
    # start from assembling bipa-sounds; entries with a falsy alias sort
    # first, so the base sound is registered before its aliases (the asserts
    # below rely on that order)
    for grapheme, sound in sorted(bipa.sounds.items(),
                                  key=lambda p: p[1].alias
                                  if p[1].alias else False):
        if sound.type != 'marker':
            if sound.alias:
                assert sound.name in sounds
                sounds[sound.name]['aliases'].add(grapheme)
            else:
                assert sound.name not in sounds
                sounds[sound.name] = {
                    'grapheme': grapheme,
                    'unicode': sound.uname or '',
                    'generated': '',
                    'note': sound.note or '',
                    'type': sound.type,
                    'aliases': set(),
                    'normalized': '+' if sound.normalized else ''
                }
            data.append(
                Grapheme(grapheme, sound.name, '+', '', 'bipa', '0', '', '',
                         '', '', sound.note or ''))

    # add sounds systematically by their alias
    for td in args.repos.iter_transcriptiondata():
        for name in td.names:
            bipa_sound = bipa[name]
            # check for consistency of mapping here
            if not is_valid_sound(bipa_sound, bipa):
                continue

            sound = sounds.get(name)
            if not sound:
                # not part of the BIPA pass above, so register it as a
                # generated sound
                sound = sounds[name] = {
                    'grapheme': bipa_sound.s,
                    'aliases': {bipa_sound.s},
                    'generated': '+',
                    'unicode': bipa_sound.uname or '',
                    'note': '',
                    'type': bipa_sound.type,
                    'alias': '+' if bipa_sound.alias else '',
                    'normalized': '+' if bipa_sound.normalized else ''
                }

            for item in td.data[name]:
                sound['aliases'].add(item['grapheme'])
                # add the values here
                data.append(
                    Grapheme(
                        item['grapheme'],
                        name,
                        item['explicit'],
                        '',  # sounds[name]['alias'],
                        td.id,
                        item.get('frequency', ''),
                        item.get('url', ''),
                        item.get('features', ''),
                        item.get('image', ''),
                        item.get('sound', ''),
                    ))
        if test:
            break

    # sound classes have a generative component, so we need to treat them
    # separately
    for sc in args.repos.iter_soundclass():
        for name in sounds:
            try:
                grapheme = sc[name]
                data.append(
                    Grapheme(
                        grapheme,
                        name,
                        '+' if name in sc.data else '',
                        '',
                        sc.id,
                    ))
            except KeyError:  # pragma: no cover
                # Format eagerly: the sound name is data, not a %-style
                # template, so it must not be passed as the log message with
                # the grapheme as its formatting argument.
                args.log.debug('{0}: {1}'.format(
                    name, sounds[name]['grapheme']))
        if test:
            break

    # last run, check again for each of the remaining transcription systems,
    # whether we can translate the sound
    for ts in args.repos.iter_transcriptionsystem(exclude=['bipa']):
        for name in sounds:
            try:
                ts_sound = ts[name]
                if is_valid_sound(ts_sound, ts):
                    sounds[name]['aliases'].add(ts_sound.s)
                    data.append(
                        Grapheme(
                            ts_sound.s,
                            name,
                            '' if sounds[name]['generated'] else '+',
                            '',  # sounds[name]['alias'],
                            ts.id,
                        ))
            except ValueError:
                # best effort: a ValueError here is deliberately ignored and
                # the sound is skipped for this system
                pass
            except TypeError:
                args.log.debug('{0}: {1}'.format(ts.id, name))
        if test:
            break

    with UnicodeWriter(args.repos.data_path('sounds.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(
            ['NAME', 'TYPE', 'GRAPHEME', 'UNICODE', 'GENERATED', 'NOTE'])
        # rows are written in reverse alphabetical order of the sound name
        for k, v in sorted(sounds.items(), reverse=True):
            writer.writerow([
                k, v['type'], v['grapheme'], v['unicode'], v['generated'],
                v['note']
            ])

    with UnicodeWriter(args.repos.data_path('graphemes.tsv'),
                       delimiter='\t') as writer:
        # header and rows both follow the attrs field order of Grapheme
        writer.writerow([f.name for f in attr.fields(Grapheme)])
        for row in data:
            writer.writerow(attr.astuple(row))