Example #1
def report(analysis):
    segments = Table('Segment', 'Occurrence', 'BIPA', 'CLTS SoundClass')
    for a, b in sorted(
            analysis['stats']['segments'].items(), key=lambda x: (-x[1], x[0])):
        c, d = '✓', '✓'
        if a in analysis['stats']['sclass_errors']:
            c = '✓' if a not in analysis['stats']['bipa_errors'] else '?'
            # a is known to be in sclass_errors here; list its offending classes
            d = ', '.join(analysis['stats']['sclass_errors'][a])

        # escape pipes so a segment cannot break the markdown table row
        a = a.replace('|', '\\|')

        segments.append([a, b, c, d])

    invalid = Table('ID', 'LANGUAGE', 'CONCEPT', 'FORM')
    for row in analysis['stats']['invalid_words']:
        invalid.append(row)  # pragma: no cover

    words = Table('ID', 'LANGUAGE', 'CONCEPT', 'FORM', 'SEGMENTS')
    for row in analysis['stats']['bad_words']:
        words.append(row)
    return TEMPLATE.format(
        segments.render(verbose=True),
        invalid.render(verbose=True),
        words.render(verbose=True))
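The pipe escaping above matters because Table.render emits a markdown table by default, where an unescaped | inside a cell would be read as a column separator. A minimal illustration with a hypothetical segment value:

seg = 'a|b'                      # hypothetical segment containing a pipe
print(seg.replace('|', '\\|'))   # -> a\|b, safe inside a markdown table cell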
Example #2
def test_Table():
    from operator import itemgetter
    from clldutils.markup import Table

    t = Table()
    assert t.render() == ''

    t = Table('a', 'b', rows=[[1, 2], [3, 4]])
    assert t.render() == \
        '| a | b |\n|----:|----:|\n| 1 | 2 |\n| 3 | 4 |'
    assert t.render(condensed=False) == \
        '|   a |   b |\n|----:|----:|\n|   1 |   2 |\n|   3 |   4 |'
    assert t.render(verbose=True) == \
        '| a | b |\n|----:|----:|\n| 1 | 2 |\n| 3 | 4 |\n\n(2 rows)\n\n'
    assert t.render(sortkey=itemgetter(1), reverse=True) == \
        '| a | b |\n|----:|----:|\n| 3 | 4 |\n| 1 | 2 |'
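The test above pins down the render options the remaining examples rely on: markdown output by default, condensed=False for padded cells, verbose=True for a trailing row count, and sortkey/reverse for ordering. A minimal standalone sketch, assuming clldutils is installed:

from operator import itemgetter
from clldutils.markup import Table

t = Table('name', 'count')
t.append(['foo', 2])
t.append(['bar', 10])
# extra keyword arguments are passed through to tabulate,
# so tablefmt='simple' gives plain-text rather than markdown output
print(t.render(tablefmt='simple', sortkey=itemgetter(1), reverse=True))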
Example #3
def ls(args):
    """
    gelato ls [COLS]+

    column specification:
    - license
    - macroareas
    """
    table = Table('ID', 'Title')
    cols = [col for col in args.args if col in ['license', 'macroareas']]
    if cols:
        table.columns.extend(col.capitalize() for col in cols)
    for d in data_path(repos=Path(args.gelato_repos)).iterdir():
        if is_dataset_dir(d):
            ds = Dataset(d)
            row = [d.name, ds.md['dc:title']]
            for col in cols:
                if col == 'license':
                    lic = licenses.find(ds.md.get('dc:license') or '')
                    row.append(lic.id if lic else ds.md.get('dc:license'))

            table.append(row)
    print(
        table.render(tablefmt='simple',
                     sortkey=lambda r: r[0],
                     condensed=False))
Example #4
def coverage(args):
    from pyconcepticon.api import Concepticon

    varieties = defaultdict(set)

    def _word_length(ds, **kw):
        ds.coverage(varieties)

    with_dataset(args, _word_length)

    print('varieties', len(varieties))

    c = Concepticon(args.concepticon_repos)
    res = Counter()

    for cl in c.conceptlists.values():
        try:
            concepts = set(
                int(cc.concepticon_id) for cc in cl.concepts.values()
                if cc.concepticon_id)
        except ValueError:  # skip lists with non-numeric concepticon ids
            continue
        for varid, meanings in varieties.items():
            if concepts.issubset(meanings):
                res.update([cl['ID']])

    t = Table('concept list', 'variety count')
    for p in res.most_common():
        t.append(list(p))
    print(t.render(tablefmt='simple', condensed=False))
Example #5
def download(bibfile, log):  # pragma: no cover
    with NamedTemporaryFile(delete=False) as fp:  # keep the file so it can be reopened by name
        log.info('download bib from {0}'.format(URL))
        fp.write(urlopen(URL).read())

    bibtool(fp.name)
    stats = Counter()

    def fix_entry_type(entry):
        type_ = entry.type.lower()
        type_map = {
            'thesis': 'phdthesis',
            'mvreference': 'misc',
            'report': 'techreport'
        }
        entry.type = type_map.get(type_, type_)
        stats.update([entry.type])

    bibfile.update(fp.name, log=log)
    bibfile.visit(fix_entry_type)
    bibfile.check(log)
    res = Table('entry type', '#')
    res.extend(list(stats.most_common()))
    res.append(['TOTAL', sum(stats.values())])
    print('\n' + res.render(tablefmt='simple'))
Example #6
def readme_concepticondata(api, cls):
    """
    Returns a dictionary with concept set label as value and tuples of concept
    list identifier and concept label as values.
    """
    D, G = defaultdict(list), defaultdict(list)
    labels = Counter()

    for cl in cls:
        for concept in [c for c in cl.concepts.values() if c.concepticon_id]:
            D[concept.concepticon_gloss].append(
                (cl.id, concept.label))
            G[concept.label].append(
                (concept.concepticon_id, concept.concepticon_gloss, cl.id))
            labels.update([concept.label])

    txt = [
        """
# Concepticon Statistics
* concept sets (used): {0}
* concept lists: {1}
* concept labels: {2}
* concept labels (unique): {3}
* Ø concepts per list: {4:.2f}
* Ø concepts per concept set: {5:.2f}
* Ø unique concept labels per concept set: {6:.2f}

""".format(
            len(D),
            len(cls),
            sum(list(labels.values())),
            len(labels),
            sum(list(labels.values())) / len(cls),
            sum([len(v) for k, v in D.items()]) / len(D),
            sum([len(set([label for _, label in v])) for k, v in D.items()]) /
            len(D))
    ]

    for attr, key in [
        ('Diverse', lambda x: (len(set([label for _, label in x[1]])), x[0] or '')),
        ('Frequent', lambda x: (len(set([clist for clist, _ in x[1]])), x[0] or '')),
    ]:
        table = Table(
            'No.', 'concept set', 'distinct labels', 'concept lists', 'examples')
        for i, (k, v) in enumerate(
                sorted(D.items(), key=key, reverse=True)[:20]):
            table.append([
                i + 1,
                k,
                len(set([label for _, label in v])),
                len(set([clist for clist, _ in v])),
                ', '.join(sorted(set(
                    ['«{0}»'.format(
                        label.replace('*', '`*`')) for _, label in v])))
            ])
        txt.append('## Twenty Most {0} Concept Sets\n\n{1}\n'.format(
            attr, table.render()))

    readme(api.data_path(), txt)
    return D, G
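Both rankings above share one table-building loop and differ only in their sort key; each key maps a (gloss, occurrences) item to a tuple, so ties on the count fall back to the gloss. A reduced sketch of the idiom, with made-up data:

D = {'HAND': [('cl1', 'hand'), ('cl2', 'mano')], 'EYE': [('cl1', 'eye')]}
# rank concept sets by number of distinct labels, tie-broken by gloss
key = lambda x: (len({label for _, label in x[1]}), x[0] or '')
for gloss, occurrences in sorted(D.items(), key=key, reverse=True)[:20]:
    print(gloss, len(occurrences))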
Example #7
def ls(args):
    t = Table('id', 'name', 'type', 'variables', 'societies')
    for ds in args.repos.datasets:
        t.append(
            [ds.id, ds.name, ds.type,
             len(ds.variables),
             len(ds.societies)])
    print(t.render(condensed=False, verbose=True))
Example #8
def run(args):
    t = Table('id', 'dir', 'title')
    for ds in get_datasets(args):
        if args.modules:
            print(inspect.getfile(ds.__class__))
            continue
        t.append((ds.id, ds.dir, getattr(ds.metadata, 'title', '')))
    if not args.modules:
        print(t.render(tablefmt='simple'))
Example #9
def stats(args):
    """
    cldf stats <DATASET>

    Print basic stats for CLDF dataset <DATASET>, where <DATASET> may be the path to
    - a CLDF metadata file
    - a CLDF core data file
    """
    ds = _get_dataset(args)
    print(ds)
    md = Table('key', 'value')
    md.extend(ds.properties.items())
    print(md.render(condensed=False, tablefmt=None))
    print()
    t = Table('Path', 'Type', 'Rows')
    for p, type_, r in ds.stats():
        t.append([p, type_, r])
    print(t.render(condensed=False, tablefmt=None))
Example #10
def list_(args):
    """List datasets available for loading

    clics --lexibank-repos=PATH/TO/lexibank-data list
    """
    if args.unloaded:
        i = 0
        for i, ds in enumerate(iter_datasets()):
            print(ds.cldf_dir)
        if not i:
            print('No datasets installed')  # pragma: no cover
    else:
        table = Table('#', 'Dataset', 'Glosses', 'Concepticon', 'Varieties',
                      'Glottocodes', 'Families')
        try:
            concept_counts = {
                r[0]: r[1:]
                for r in args.api.db.fetchall('concepts_by_dataset')
            }
        except sqlite3.OperationalError:  # pragma: no cover
            print('No datasets loaded yet')
            return

        varieties = args.api.db.varieties
        var_counts = {}
        for dsid, vs in groupby(varieties, lambda v: v.source):
            vs = list(vs)
            var_counts[dsid] = (len(vs), len(set(v.glottocode for v in vs)),
                                len(set(v.family for v in vs)))

        for count, d in enumerate(args.api.db.datasets):
            table.append([
                count + 1,
                d.replace('lexibank-', ''),
                concept_counts[d][1],
                concept_counts[d][0],
                var_counts[d][0],
                var_counts[d][1],
                var_counts[d][2],
            ])
        table.append([
            '', 'TOTAL', 0,
            args.api.db.fetchone("""\
select
    count(distinct p.concepticon_id) from parametertable as p, formtable as f, languagetable as l
where
    f.parameter_id = p.id and f.dataset_id = p.dataset_id
    and f.language_id = l.id and f.dataset_id = l.dataset_id
    and l.glottocode is not null
    and l.family != 'Bookkeeping'
""")[0],
            len(varieties),
            len(set(v.glottocode for v in varieties)),
            len(set(v.family for v in varieties))
        ])
        print(table.render(tablefmt='simple'))
Example #11
def list_(args):
    """List datasets available for loading

    clics --lexibank-repos=PATH/TO/lexibank-data list
    """
    if args.unloaded:
        i = 0
        for i, ds in enumerate(iter_datasets()):
            print(ds.cldf_dir)
        if not i:
            print('No datasets installed')  # pragma: no cover
    else:
        table = Table(
            '#', 'Dataset', 'Glosses', 'Concepticon', 'Varieties', 'Glottocodes', 'Families')
        try:
            concept_counts = {r[0]: r[1:] for r in args.api.db.fetchall('concepts_by_dataset')}
        except sqlite3.OperationalError:  # pragma: no cover
            print('No datasets loaded yet')
            return

        varieties = args.api.db.varieties
        var_counts = {}
        for dsid, vs in groupby(varieties, lambda v: v.source):
            vs = list(vs)
            var_counts[dsid] = (
                len(vs), len(set(v.glottocode for v in vs)), len(set(v.family for v in vs)))

        for count, d in enumerate(args.api.db.datasets):
            table.append([
                count + 1,
                d.replace('lexibank-', ''),
                concept_counts[d][1],
                concept_counts[d][0],
                var_counts[d][0],
                var_counts[d][1],
                var_counts[d][2],
            ])
        table.append([
            '',
            'TOTAL',
            0,
            args.api.db.fetchone(
                """\
select
    count(distinct p.concepticon_id) from parametertable as p, formtable as f, languagetable as l
where
    f.parameter_id = p.id and f.dataset_id = p.dataset_id
    and f.language_id = l.id and f.dataset_id = l.dataset_id
    and l.glottocode is not null
    and l.family != 'Bookkeeping'
""")[0],
            len(varieties),
            len(set(v.glottocode for v in varieties)),
            len(set(v.family for v in varieties))
        ])
        print(table.render(tablefmt='simple'))
Example #12
    def test_Table(self):
        from operator import itemgetter
        from clldutils.markup import Table

        t = Table()
        self.assertEqual(t.render(), '')

        t = Table('a', 'b', rows=[[1, 2], [3, 4]])
        self.assertEqual(
            t.render(),
            '| a | b |\n|----:|----:|\n| 1 | 2 |\n| 3 | 4 |')
        self.assertEqual(
            t.render(condensed=False),
            '|   a |   b |\n|----:|----:|\n|   1 |   2 |\n|   3 |   4 |')
        self.assertEqual(
            t.render(verbose=True),
            '| a | b |\n|----:|----:|\n| 1 | 2 |\n| 3 | 4 |\n\n(2 rows)\n\n')
        self.assertEqual(
            t.render(sortkey=itemgetter(1), reverse=True),
            '| a | b |\n|----:|----:|\n| 3 | 4 |\n| 1 | 2 |')
Example #13
def readme_conceptlists(api, cls, args):
    table = Table("name", "# mapped", "% mapped", "mergers")
    for cl in cls:
        args.log.info("processing <" + cl.path.name + ">")
        mapped, mapped_ratio, mergers = cl.stats()
        table.append(["[%s](%s) " % (cl.id, cl.path.name), len(mapped), mapped_ratio, len(mergers)])
    readme(
        api.data_path("conceptlists"),
        "# Concept Lists\n\n{0}".format(table.render(verbose=True, sortkey=operator.itemgetter(0))),
    )
Example #14
def features(args):
    bipa = TranscriptionSystem(args.system)
    features = set()
    for sound in bipa.sounds.values():
        if sound.type not in ['marker', 'unknownsound']:
            for k, v in sound.featuredict.items():
                features.add((sound.type, k, v or ''))
    table = Table('TYPE', 'FEATURE', 'VALUE')
    table.extend(sorted(features))
    print(table.render(tablefmt='simple'))
Example #15
def readme_concept_list_meta(api):
    """Writes statistics on metadata to readme."""
    txt = "# Basic Statistics on Metadata\n\n{0}"
    cnc = len(api.conceptsets)
    table = Table("provider", "ID", "# concept sets", "% coverage")
    for meta in api.metadata.values():
        n = len(meta.values)
        table.append([meta.meta.get("dc:title"), meta.id, n, (n / cnc) * 100])
    readme(
        api.data_path("concept_set_meta"),
        txt.format(table.render(sortkey=operator.itemgetter(1), reverse=True, condensed=False)),
    )
Example #16
def run(args):
    md = ['# Sources', '']
    for datatype in ['datasets', 'phylogenies']:
        md.append('\n## {0}\n'.format(datatype.capitalize()))
        t = Table('Name', 'Reference')
        for obj in getattr(args.repos, datatype):
            if not obj.id.startswith('glottolog_') or obj.id == 'glottolog_global':
                t.append([
                    '[{0}]({1}/{2})'.format(obj.name, datatype, obj.id),
                    obj.reference])
        md.append(t.render(condensed=False))
    args.repos.path('SOURCES.md').write_text('\n'.join(md), encoding='utf-8')
Example #17
def readme_concept_list_meta(api):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n{0}'
    cnc = len(api.conceptsets)
    table = Table('provider', 'ID', '# concept sets', '% coverage')
    for meta in api.metadata.values():
        n = len(meta.values)
        table.append([meta.meta.get('dc:title'), meta.id, n, (n / cnc) * 100])
    readme(
        api.data_path('concept_set_meta'),
        txt.format(
            table.render(sortkey=operator.itemgetter(1),
                         reverse=True,
                         condensed=False)))
Example #18
def readme_conceptlists(api, cls):
    table = Table('name', '# mapped', '% mapped', 'mergers')
    for cl in cls:
        print(cl.path.name)
        mapped, mapped_ratio, mergers = cl_stats(cl)
        table.append([
            '[%s](%s) ' % (cl.id, cl.path.name),
            len(mapped),
            mapped_ratio,
            len(mergers)])
    readme(
        api.data_path('conceptlists'),
        '# Concept Lists\n\n{0}'.format(
            table.render(verbose=True, sortkey=operator.itemgetter(0))))
Example #19
def table(args):
    tts = TranscriptionSystem(args.system)
    tts_sounds = [
        tts.get(
            sound if isinstance(sound, text_type) else sound.decode('utf8'))
        for sound in args.args
    ]
    if args.filter == 'generated':
        tts_sounds = [s for s in tts_sounds if s.generated]
    elif args.filter == 'unknown':
        tts_sounds = [s for s in tts_sounds if s.type == 'unknownsound']
    elif args.filter == 'known':
        tts_sounds = [
            s for s in tts_sounds
            if not s.generated and not s.type == 'unknownsound'
        ]

    data = defaultdict(list)
    ucount = 0
    for sound in tts_sounds:
        if sound.type != 'unknownsound':
            data[sound.type] += [sound.table]
        else:
            ucount += 1
            data['unknownsound'].append(
                [text_type(ucount), sound.source or '', sound.grapheme])
    for cls in tts.sound_classes:
        if cls in data:
            print('# {0}\n'.format(cls))
            tbl = Table(*[c.upper() for c in tts.columns[cls]], rows=data[cls])
            print(tbl.render(tablefmt=args.format, condensed=False))
            print('')
    if data['unknownsound']:
        print('# Unknown sounds\n')
        tbl = Table('NUMBER', 'SOURCE', 'GRAPHEME', rows=data['unknownsound'])
        print(tbl.render(tablefmt=args.format, condensed=False))
Example #20
def refsearch(args):
    """Search Glottolog references

    glottolog --repos=. refsearch "QUERY"

    E.g.:
    - glottolog refsearch "Izi provider:hh"
    - glottolog refsearch "author:Haspelmath provider:wals"
    """
    count, results = fts.search(args.repos, args.args[0])
    table = Table('ID', 'Author', 'Year', 'Title')
    for res in results:
        table.append([res.id, res.author, res.year, res.title])
    sprint(table.render(tablefmt='simple'))
    print('({} matches)'.format(count))
Example #21
def readme_conceptlists(api, cls):
    table = Table('name', '# mapped', '% mapped', 'mergers')
    for cl in cls:
        concepts = cl.concepts.values()
        mapped = len([c for c in concepts if c.concepticon_id])
        mapped_ratio = 0
        if concepts:
            mapped_ratio = int((mapped / len(concepts)) * 100)
        concepticon_ids = Counter(
            [c.concepticon_id for c in concepts if c.concepticon_id])
        mergers = len([k for k, v in concepticon_ids.items() if v > 1])
        table.append(['[%s](%s) ' % (cl.id, cl.path.name), mapped, mapped_ratio, mergers])
    readme(
        api.data_path('conceptlists'),
        '# Concept Lists\n\n{0}'.format(
            table.render(verbose=True, sortkey=operator.itemgetter(0))))
Example #22
def ls(args):
    """
    lexibank ls [COLS]+

    column specification:
    - license
    - lexemes
    - macroareas
    """
    # FIXME: how to smartly choose columns?
    table = Table('ID', 'Title')
    cols = [
        col for col in args.args
        if col in ['license', 'lexemes', 'macroareas']
    ]
    tl = 40
    if args.args:
        tl = 25
        table.columns.extend(col.capitalize() for col in cols)
    for d in data_path(repos=Path(args.lexibank_repos)).iterdir():
        if is_dataset_dir(d):
            ds = Dataset(d)
            row = [d.name, short_title(ds.md['dc:title'], l=tl)]
            for col in cols:
                if col == 'license':
                    lic = licenses.find(ds.md.get('dc:license') or '')
                    row.append(lic.id if lic else ds.md.get('dc:license'))
                elif col in ['lexemes', 'macroareas']:
                    mds = list(ds.iter_cldf_metadata())
                    if col == 'lexemes':
                        row.append(
                            sum(md.notes['stats']['lexeme_count']
                                for md in mds))
                    elif col == 'macroareas':
                        mas = set()
                        for md in mds:
                            mas = mas.union(md.notes['stats']['macroareas'])
                        row.append(', '.join(sorted(mas)))

            table.append(row)
    print(
        table.render(tablefmt='simple',
                     sortkey=lambda r: r[0],
                     condensed=False))
Example #23
def metadata(args):
    """List all metadata fields used in languoid INI files and their frequency.

    glottolog metadata
    """
    ops = defaultdict(Counter)

    for l in args.repos.languoids():
        for secname, sec in l.cfg.items():
            ops[secname].update(opt for opt, val in sec.items() if val)

    ops.pop('DEFAULT', None)

    t = Table('section', 'option', 'count')
    for section, options in ops.items():
        t.append([section, '', float(sum(options.values()))])
        for k, n in options.most_common():
            t.append(['', k, float(n)])
    print(t.render(condensed=False, floatfmt=',.0f'))
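The ops mapping above is the usual two-level tally: a defaultdict of Counters, one Counter of option frequencies per INI section. A minimal sketch of the pattern:

from collections import Counter, defaultdict

ops = defaultdict(Counter)
ops['core'].update(['name', 'iso', 'name'])  # count option occurrences
print(ops['core'].most_common())  # [('name', 2), ('iso', 1)]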
Example #24
def metadata(args):
    """List all metadata fields used in languoid INI files and their frequency.

    glottolog metadata
    """
    ops = defaultdict(Counter)

    for l in args.repos.languoids():
        for sec in l.cfg:
            for opt in l.cfg[sec]:
                if l.cfg.get(sec, opt):
                    ops[sec].update([opt])

    t = Table('section', 'option', 'count')
    for section, options in ops.items():
        t.append([section, '', float(sum(options.values()))])  # section total
        for k, n in options.most_common():
            t.append(['', k, float(n)])
    print(t.render(condensed=False, floatfmt=',.0f'))
Example #25
def sounds(args):
    tts = TranscriptionSystem(args.system)
    data = []
    for sound in args.args:
        sound = tts.get(
            sound if isinstance(sound, text_type) else sound.decode('utf8'))
        if sound.type != 'unknownsound':
            data += [[
                text_type(sound), sound.source or ' ',
                '1' if sound.generated else ' ',
                sound.grapheme if sound.alias else ' ', sound.name
            ]]
        else:
            data += [['?', sound.source, '?', '?', '?']]
    tbl = Table(args.system.upper(),
                'SOURCE',
                'GENERATED',
                'ALIAS',
                'NAME',
                rows=data)
    print(tbl.render(tablefmt=args.format, condensed=False))
Example #26
def update(newbib, bibfile, log):  # pragma: no cover
    bibtool(newbib)
    stats = Counter()

    def fix_entry_type(entry):
        type_ = entry.type.lower()
        type_map = {
            'thesis': 'phdthesis',
            'mvreference': 'misc',
            'mvbook': 'book',
            'bookinbook': 'book',
            'report': 'techreport',
        }
        entry.type = type_map.get(type_, type_)
        stats.update([entry.type])

    bibfile.update(newbib, log=log)
    bibfile.visit(fix_entry_type)
    bibfile.check(log)
    res = Table('entry type', '#')
    res.extend(list(stats.most_common()))
    res.append(['TOTAL', sum(stats.values())])
    print('\n' + res.render(tablefmt='simple'))
Example #27
def stats(args):
    """
    cdstarcat stats

    Print summary statistics of bitstreams in the catalog to stdout.
    """
    cat = _catalog(args)
    print('Summary:')
    print('  {0:,} objects with {1:,} bitstreams of total size {2}'.format(
        len(cat), sum(len(obj.bitstreams) for obj in cat), cat.size_h))
    print('  {0} duplicate bitstreams'.format(
        sum(1 for objs in cat.md5_to_object.values() if len(objs) > 1)))
    print('  {0} objects with no bitstreams'.format(
        sum(1 for obj in cat if not obj.bitstreams)))

    print()
    types = Counter(chain(*[[bs.mimetype for bs in obj.bitstreams] for obj in cat]))
    table = Table('maintype', 'subtype', 'bitstreams')
    for maintype, items in groupby(
            sorted(types.items(), key=lambda p: (p[0].split('/')[0], -p[1])),
            lambda p: p[0].split('/')[0]):
        for k, v in items:
            table.append([maintype, k.split('/')[1], v])
    print(table.render(tablefmt='simple'))
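itertools.groupby only merges adjacent items, so the sort by main MIME type above is what makes the grouping work; the secondary -p[1] key additionally orders each group's subtypes by descending bitstream count. The bare idiom:

from itertools import groupby

types = [('text/html', 3), ('image/png', 5), ('text/plain', 2)]
# sort by the group key first, otherwise groupby yields fragmented groups
for maintype, items in groupby(sorted(types, key=lambda p: p[0].split('/')[0]),
                               lambda p: p[0].split('/')[0]):
    print(maintype, list(items))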
Example #28
def print_count(count):
    t = Table('concept list', 'glang count')
    for p in count.most_common(n=10):
        t.append(list(p))
    print(t.render(tablefmt='simple', condensed=False))
Example #29
def coverage(args):  # pragma: no cover
    from pyconcepticon.api import Concepticon

    varieties = defaultdict(set)
    glangs = defaultdict(set)
    concept_count = defaultdict(set)
    res80 = Counter()
    res85 = Counter()
    res90 = Counter()
    res80v = Counter()
    res85v = Counter()
    res90v = Counter()

    def _coverage(ds, **kw):
        ds.coverage(varieties, glangs, concept_count)

    with_dataset(args, _coverage)

    print('varieties', len(varieties))

    concepticon = Concepticon(args.cfg['paths']['concepticon'])
    for cl in concepticon.conceptlists.values():
        try:
            concepts = set(
                int(cc.concepticon_id) for cc in cl.concepts.values()
                if cc.concepticon_id)
        except ValueError:  # skip lists with non-numeric concepticon ids
            continue
        for varid, meanings in varieties.items():
            # relative coverage: share of the list's concepts attested in this variety
            c = len(concepts.intersection(meanings)) / len(concepts)
            if c >= 0.8:
                res80v.update([cl.id])
            if c >= 0.85:
                res85v.update([cl.id])
            if c >= 0.9:
                res90v.update([cl.id])

        for varid, meanings in glangs.items():
            # relative coverage: share of the list's concepts attested in this Glottolog language
            c = len(concepts.intersection(meanings)) / len(concepts)
            if c >= 0.8:
                res80.update([cl.id])
            if c >= 0.85:
                res85.update([cl.id])
            if c >= 0.9:
                res90.update([cl.id])

    def print_count(count):
        t = Table('concept list', 'glang count')
        for p in count.most_common(n=10):
            t.append(list(p))
        print(t.render(tablefmt='simple', condensed=False))

    print('\nGlottolog languages with coverage >= 80%:')
    print_count(res80)

    print('\nGlottolog languages with coverage >= 85%:')
    print_count(res85)

    print('\nGlottolog languages with coverage >= 90%:')
    print_count(res90)

    print('\nVarieties with coverage >= 80%:')
    print_count(res80v)

    print('\nVarieties with coverage >= 85%:')
    print_count(res85v)

    print('\nVarieties with coverage >= 90%:')
    print_count(res90v)

    print('\ntop-200 concepts:')
    t = Table('cid', 'gloss', 'varieties')
    for n, m in sorted([(cid, len(vids))
                        for cid, vids in concept_count.items()],
                       key=lambda i: -i[1])[:200]:
        t.append([n, concepticon.conceptsets['%s' % n].gloss, m])
    print(t.render(tablefmt='simple', condensed=False))
Example #30
def ls(args):
    """
    lexibank ls [COLS]+

    column specification:
    - license
    - lexemes
    - macroareas
    """
    db = Database(args.db)
    db.create(exists_ok=True)
    in_db = {
        r[0]: r[1]
        for r in db.fetchall('select id, version from dataset')
    }
    # FIXME: how to smartly choose columns?
    table = Table('ID', 'Title')
    cols = OrderedDict([(col, {}) for col in args.args if col in [
        'version',
        'location',
        'changes',
        'license',
        'all_lexemes',
        'lexemes',
        'concepts',
        'languages',
        'families',
        'varieties',
        'macroareas',
    ]])
    tl = 40
    if cols:
        tl = 25
        table.columns.extend(col.capitalize() for col in cols)

    for col, sql in [
        ('languages', 'glottocodes_by_dataset'),
        ('concepts', 'conceptsets_by_dataset'),
        ('lexemes', 'mapped_lexemes_by_dataset'),
        ('all_lexemes', 'lexemes_by_dataset'),
        ('macroareas', 'macroareas_by_dataset'),
        ('families', 'families_by_dataset'),
    ]:
        if col in cols:
            cols[col] = {r[0]: r[1] for r in db.fetchall(sql)}
    for ds in args.cfg.datasets:
        row = [
            colored(ds.id, 'green' if ds.id in in_db else 'red'),
            truncate_with_ellipsis(ds.metadata.title or '', width=tl),
        ]
        for col in cols:
            if col == 'version':
                row.append(git_hash(ds.dir))
            elif col == 'location':
                row.append(colored(str(ds.dir), 'green'))
            elif col == 'changes':
                row.append(ds.git_repo.is_dirty())
            elif col == 'license':
                lic = licenses.find(ds.metadata.license or '')
                row.append(lic.id if lic else ds.metadata.license)
            elif col in [
                    'languages', 'concepts', 'lexemes', 'all_lexemes',
                    'families'
            ]:
                row.append(float(cols[col].get(ds.id, 0)))
            elif col == 'macroareas':
                row.append(', '.join(
                    sorted((cols[col].get(ds.id) or '').split(','))))
            else:
                row.append('')

        table.append(row)
    totals = ['zztotal', len(args.cfg.datasets)]
    for i, col in enumerate(cols):
        if col in ['lexemes', 'all_lexemes']:
            totals.append(sum([r[i + 2] for r in table]))
        elif col == 'languages':
            totals.append(
                float(
                    db.fetchone(
                        "SELECT count(distinct glottocode) FROM languagetable")
                    [0]))
        elif col == 'concepts':
            totals.append(
                float(
                    db.fetchone(
                        "SELECT count(distinct concepticon_id) FROM parametertable"
                    )[0]))
        elif col == 'families':
            totals.append(
                float(
                    db.fetchone(
                        "SELECT count(distinct family) FROM languagetable")
                    [0]))
        else:
            totals.append('')
    table.append(totals)
    print(
        table.render(tablefmt='simple',
                     sortkey=lambda r: r[0],
                     condensed=False,
                     floatfmt=',.0f'))
Example #31
def run(args):
    db = get_db(args)
    in_db = {r[0]: r[1] for r in db.fetchall('select id, version from dataset')}

    table = Table('ID', 'Title')
    cols = collections.OrderedDict([
        (col, {}) for col in COLS if getattr(args, col, None) or args.all])
    tl = 40
    if cols:
        tl = 25
        table.columns.extend(col.capitalize() for col in cols)

    for col, sql in [
        ('languages', 'glottocodes_by_dataset'),
        ('concepts', 'conceptsets_by_dataset'),
        ('lexemes', 'mapped_lexemes_by_dataset'),
        ('all_lexemes', 'lexemes_by_dataset'),
        ('macroareas', 'macroareas_by_dataset'),
        ('families', 'families_by_dataset'),
    ]:
        if col in cols:
            cols[col] = {r[0]: r[1] for r in db.fetchall(sql)}
    datasets = get_datasets(args)
    for ds in datasets:
        row = [
            termcolor.colored(ds.id, 'green' if ds.id in in_db else 'red'),
            textwrap.shorten(ds.metadata.title or '', width=tl),
        ]
        for col in cols:
            if col == 'version':
                row.append(ds.repo.hash())
            elif col == 'location':
                row.append(termcolor.colored(str(ds.dir), 'green'))
            elif col == 'changes':
                row.append(ds.repo.is_dirty())
            elif col == 'license':
                lic = licenses.find(ds.metadata.license or '')
                row.append(lic.id if lic else ds.metadata.license)
            elif col in ['languages', 'concepts', 'lexemes', 'all_lexemes', 'families']:
                row.append(float(cols[col].get(ds.id, 0)))
            elif col == 'macroareas':
                row.append(', '.join(sorted((cols[col].get(ds.id) or '').split(','))))
            else:
                row.append('')

        table.append(row)
    totals = ['zztotal', len(datasets)]
    for i, col in enumerate(cols):
        if col in ['lexemes', 'all_lexemes']:
            totals.append(sum([r[i + 2] for r in table]))
        elif col == 'languages':
            totals.append(float(db.fetchone(
                "SELECT count(distinct glottocode) FROM languagetable")[0]))
        elif col == 'concepts':
            totals.append(float(db.fetchone(
                "SELECT count(distinct concepticon_id) FROM parametertable")[0]))
        elif col == 'families':
            totals.append(float(db.fetchone(
                "SELECT count(distinct family) FROM languagetable")[0]))
        else:
            totals.append('')
    table.append(totals)
    print(table.render(
        tablefmt='simple', sortkey=lambda r: r[0], condensed=False, floatfmt=',.0f'))
Example #32
def colexification(args):
    args.api._log = args.log
    threshold = args.threshold or 1
    edgefilter = args.edgefilter
    words = {}

    def clean(word):
        return ''.join([w for w in word if w not in '/,;"'])

    varieties = args.api.db.varieties
    lgeo = geojson.FeatureCollection([v.as_geojson() for v in varieties])
    args.api.json_dump(lgeo, 'app', 'source', 'langsGeo.json')

    app_source = args.api.existing_dir('app', 'source')
    for p in Path(__file__).parent.joinpath('app').iterdir():
        target_dir = app_source.parent if p.suffix == '.html' else app_source
        shutil.copy(str(p), str(target_dir / p.name))

    args.log.info('Adding nodes to the graph')
    G = nx.Graph()
    for concept in args.api.db.iter_concepts():
        G.add_node(concept.id, **concept.as_node_attrs())

    args.log.info('Adding edges to the graph')
    for v_, forms in tqdm(args.api.db.iter_wordlists(varieties), total=len(varieties), leave=False):
        cols = full_colexification(forms)

        for k, v in cols.items():
            for formA, formB in combinations(v, r=2):
                # check for identical concept resulting from word-variants
                if formA.concepticon_id != formB.concepticon_id:
                    words[formA.gid] = [formA.clics_form, formA.form]
                    if not G.has_edge(formA.concepticon_id, formB.concepticon_id):
                        G.add_edge(
                            formA.concepticon_id,
                            formB.concepticon_id,
                            words=set(),
                            languages=set(),
                            families=set(),
                            wofam=[],
                        )

                    G[formA.concepticon_id][formB.concepticon_id]['words'].add(
                        (formA.gid, formB.gid))
                    G[formA.concepticon_id][formB.concepticon_id]['languages'].add(v_.gid)
                    G[formA.concepticon_id][formB.concepticon_id]['families'].add(v_.family)
                    G[formA.concepticon_id][formB.concepticon_id]['wofam'].append('/'.join([
                        formA.gid,
                        formB.gid,
                        formA.clics_form,
                        v_.gid,
                        v_.family,
                        clean(formA.form),
                        clean(formB.form)]))
    args.api.json_dump(words, 'app', 'source', 'words.json')

    edges = {}
    for edgeA, edgeB, data in G.edges(data=True):
        edges[edgeA, edgeB] = (len(data['families']), len(data['languages']), len(data['words']))

    ignore_edges = []
    for edgeA, edgeB, data in G.edges(data=True):
        data['WordWeight'] = len(data['words'])
        data['words'] = ';'.join(sorted(['{0}/{1}'.format(x, y) for x, y in data['words']]))
        data['FamilyWeight'] = len(data['families'])
        data['families'] = ';'.join(sorted(data['families']))
        data['LanguageWeight'] = len(data['languages'])
        data['languages'] = ';'.join(data['languages'])
        data['wofam'] = ';'.join(data['wofam'])
        if edgefilter == 'families' and data['FamilyWeight'] < threshold:
            ignore_edges.append((edgeA, edgeB))
        elif edgefilter == 'languages' and data['LanguageWeight'] < threshold:
            ignore_edges.append((edgeA, edgeB))
        elif edgefilter == 'words' and data['WordWeight'] < threshold:
            ignore_edges.append((edgeA, edgeB))

    G.remove_edges_from(ignore_edges)

    nodenames = {r[0]: r[1] for r in args.api.db.fetchall(
        "select distinct concepticon_id, concepticon_gloss from parametertable")}

    table = Table('ID A', 'Concept A', 'ID B', 'Concept B', 'Families', 'Languages', 'Words')
    count = 0
    for (nodeA, nodeB), (fc, lc, wc) in sorted(edges.items(), key=lambda i: i[1], reverse=True):
        if (nodeA, nodeB) not in ignore_edges:
            table.append([nodeA, nodenames[nodeA], nodeB, nodenames[nodeB], fc, lc, wc])
            count += 1
        if count >= 10:
            break
    print(table.render(tablefmt='simple'))

    args.api.save_graph(G, args.graphname or 'network', threshold, edgefilter)
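The loop above accumulates evidence on shared edge-attribute containers: once an edge exists, G[u][v] returns its attribute dict, so adding to the stored sets mutates the graph in place. A minimal sketch, assuming networkx is installed:

import networkx as nx

G = nx.Graph()
if not G.has_edge('A', 'B'):
    G.add_edge('A', 'B', words=set())
G['A']['B']['words'].add(('w1', 'w2'))  # mutates the edge's attribute dict
print(G.edges(data=True))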
Example #33
    def run(self, **cfg):
        cfg.setdefault('column', 'Value')
        cfg.setdefault('segmentized', False)
        self.report = defaultdict(lambda: dict(
            invalid=Counter(),
            segments=Counter(),
            lingpy_errors=set(),
            clpa_errors=set(),
            replacements=defaultdict(set),
            general_errors=0,
            word_errors=0,
            bad_words=[],
            segment_types=Counter(),
        ))
        bad_words = []
        with tqdm(total=len(list(self.dataset.iter_cldf_metadata())),
                  desc='cldf-ds',
                  leave=False) as pbar:
            for ds in self.dataset.iter_cldf_datasets():
                bad_words.extend(
                    test_sequences(ds, get_variety_id, self.report, **cfg))
                pbar.update(1)

        stats = dict(
            invalid=set(),
            tokens=0,
            segments=set(),
            lingpy_errors=set(),
            clpa_errors=set(),
            replacements=defaultdict(set),
            inventory_size=0,
            general_errors=0,
            word_errors=0,
            bad_words=[],
            segment_types=Counter(),
        )
        for lid, report in self.report.items():
            stats['invalid'].update(report['invalid'])
            stats['tokens'] += sum(report['segments'].values())
            stats['segments'].update(report['segments'].keys())

            for segment, count in report['segments'].items():
                stats['segment_types'][segment] += count

            stats['general_errors'] += report['general_errors']
            stats['word_errors'] += report['word_errors']
            stats['bad_words'] += report['bad_words']
            for attr in ['lingpy_errors', 'clpa_errors']:
                stats[attr].update(report[attr])
            for segment, repls in report['replacements'].items():
                stats['replacements'][segment].update(repls)
            # accumulate the mean per-language inventory size
            stats['inventory_size'] += len(report['segments']) / len(
                self.report)
            # make sure we can serialize as JSON:
            for attr in ['lingpy_errors', 'clpa_errors']:
                report[attr] = sorted(report[attr])
            for segment in report['replacements']:
                report['replacements'][segment] = sorted(
                    report['replacements'][segment])
        # make sure we can serialize as JSON:
        for attr in ['lingpy_errors', 'clpa_errors']:
            stats[attr + '_types'] = sorted(stats[attr])
        for attr in ['invalid', 'segments', 'lingpy_errors', 'clpa_errors']:
            stats[attr] = len(stats[attr])
        for segment in stats['replacements']:
            stats['replacements'][segment] = sorted(
                stats['replacements'][segment])

        self.report['stats'] = stats
        jsonlib.dump(self.report, self.fname, indent=4)

        if not cfg.get('segmentized'):
            return

        segments = Table('Segment', 'Occurrence', 'LingPy', 'CLPA')
        for a, b in sorted(stats['segment_types'].items(),
                           key=lambda x: (-x[1], x[0])):
            c, d = '✓', '✓'
            if a in stats['clpa_errors_types']:
                c = '✓' if a not in stats['lingpy_errors_types'] else '?'
                # clpa_errors_types is a sorted list of offending segments
                # (built above), so the segment can only be flagged
                d = '?'
            segments.append([a, b, c, d])

        words = Table('ID', 'LANGUAGE', 'CONCEPT', 'VALUE', 'SEGMENTS')
        with tqdm(total=len(bad_words), desc='bad-lexemes',
                  leave=False) as pbar:
            for i, row in enumerate(bad_words):
                analyzed = []
                for segment in row[cfg['column']].split(' '):
                    if segment in stats['lingpy_errors_types'] \
                            or segment in stats['clpa_errors_types']:
                        analyzed.append('<s> %s </s>' % segment)
                    else:
                        analyzed.append(segment)
                words.append([
                    row['ID'], row['Language_name'], row['Parameter_name'],
                    row['Value'], ' '.join(analyzed)
                ])
                if i % 10 == 0:
                    pbar.update(10)
        return """\
# Detailed transcription record

## Segments

{0}
## Words

{1}""".format(segments.render(verbose=True), words.render(verbose=True))