Example #1
def clean(args):
    """
    Remove CLDF-formatted data for the given dataset.

    lexibank clean [DATASET_ID]
    """
    with_dataset(args, Dataset._clean)
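
Every command in this listing delegates to with_dataset, which resolves dataset IDs from the command-line arguments and applies a callable to each selected dataset. The real implementation lives in pylexibank.commands.util; the following is only a sketch of the dispatch pattern, assuming nothing beyond the args.args and args.cfg.datasets attributes the examples already use:

def with_dataset(args, func, default_to_all=False, **kw):
    # Sketch only: pick the datasets named on the command line, or all
    # configured datasets when none are named and default_to_all is set.
    selected = args.args or (
        [ds.id for ds in args.cfg.datasets] if default_to_all else [])
    for ds in args.cfg.datasets:
        if ds.id in selected:
            func(ds, **kw)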
Example #2
from pybtex.database import BibliographyData, parse_file


def bib(args):
    """Merge the .bib files of all datasets into a single lexibank.bib."""
    gbib = BibliographyData()

    def _harvest(ds, **kw):
        for bibfile in ds.cldf_dir.glob('*.bib'):
            for id_, entry in parse_file(str(bibfile)).entries.items():
                # Qualify entry keys with the dataset ID to avoid
                # collisions between datasets.
                id_ = '{0}:{1}'.format(ds.id, id_)
                if id_ not in gbib.entries:
                    gbib.add_entry(id_, entry)

    with_dataset(args, _harvest, default_to_all=True)
    gbib.to_file(
        str(Path(args.cfg['paths']['lexibank']).joinpath('lexibank.bib')))
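
pybtex's BibliographyData round-trips BibTeX cleanly, which is what makes the merge above safe. A tiny self-contained demonstration of the container API (the entry key and content are made up):

from pybtex.database import BibliographyData, Entry

bd = BibliographyData()
bd.add_entry('ds1:smith2009',
             Entry('book', {'title': 'A Grammar', 'year': '2009'}))
print(bd.to_string('bibtex'))  # emits a @book{ds1:smith2009, ...} record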
Example #3
from termcolor import colored


def diff(args):
    """Show uncommitted changes in the git repositories of datasets."""
    def _diff(ds, **kw):
        repo = ds.git_repo
        if repo and repo.is_dirty():
            print('{0} at {1}'.format(colored(ds.id, 'blue', attrs=['bold']),
                                      colored(str(ds.dir), 'blue')))
            # Files modified in the working tree relative to the index:
            for i, item in enumerate(repo.index.diff(None)):
                if i == 0:
                    print(colored('modified:', attrs=['bold']))
                print(colored(item.a_path, 'green'))
            # Files git does not know about yet:
            for i, path in enumerate(repo.untracked_files):
                if i == 0:
                    print(colored('untracked:', attrs=['bold']))
                print(colored(path, 'green'))
            print()

    # Default to all configured datasets when none are given explicitly.
    if not args.args:
        args.args = [ds.id for ds in args.cfg.datasets]
    with_dataset(args, _diff)
Example #4
from collections import Counter, defaultdict

from clldutils.markup import Table


def coverage(args):  # pragma: no cover
    """Report how well Concepticon concept lists are covered by the datasets."""
    from pyconcepticon.api import Concepticon

    varieties = defaultdict(set)      # variety ID -> attested Concepticon IDs
    glangs = defaultdict(set)         # Glottolog language -> attested Concepticon IDs
    concept_count = defaultdict(set)  # Concepticon ID -> varieties attesting it
    # Number of languages (res*) and varieties (res*v) reaching
    # 80/85/90% coverage, counted per concept list:
    res80 = Counter()
    res85 = Counter()
    res90 = Counter()
    res80v = Counter()
    res85v = Counter()
    res90v = Counter()

    def _coverage(ds, **kw):
        ds.coverage(varieties, glangs, concept_count)

    with_dataset(args, _coverage)

    print('varieties', len(varieties))

    concepticon = Concepticon(args.cfg['paths']['concepticon'])
    for cl in concepticon.conceptlists.values():
        try:
            concepts = set(
                int(cc.concepticon_id) for cc in cl.concepts.values()
                if cc.concepticon_id)
        except ValueError:  # skip lists with non-numeric Concepticon IDs
            continue
        for varid, meanings in varieties.items():
            # relative coverage: share of the list's concepts attested for this variety
            c = len(concepts.intersection(meanings)) / len(concepts)
            if c >= 0.8:
                res80v.update([cl.id])
            if c >= 0.85:
                res85v.update([cl.id])
            if c >= 0.9:
                res90v.update([cl.id])

        for varid, meanings in glangs.items():
            # relative coverage: share of the list's concepts attested for this language
            c = len(concepts.intersection(meanings)) / len(concepts)
            if c >= 0.8:
                res80.update([cl.id])
            if c >= 0.85:
                res85.update([cl.id])
            if c >= 0.9:
                res90.update([cl.id])

    def print_count(count, unit):
        t = Table('concept list', unit)
        for p in count.most_common(n=10):
            t.append(list(p))
        print(t.render(tablefmt='simple', condensed=False))

    print('\nGlottolog languages with coverage >= 80%:')
    print_count(res80, 'glang count')

    print('\nGlottolog languages with coverage >= 85%:')
    print_count(res85, 'glang count')

    print('\nGlottolog languages with coverage >= 90%:')
    print_count(res90, 'glang count')

    print('\nVarieties with coverage >= 80%:')
    print_count(res80v, 'variety count')

    print('\nVarieties with coverage >= 85%:')
    print_count(res85v, 'variety count')

    print('\nVarieties with coverage >= 90%:')
    print_count(res90v, 'variety count')

    print('\ntop-200 concepts:')
    t = Table('cid', 'gloss', 'varieties')
    for cid, m in sorted(
            [(cid, len(vs)) for cid, vs in concept_count.items()],
            key=lambda i: -i[1])[:200]:
        t.append([cid, concepticon.conceptsets['%s' % cid].gloss, m])
    print(t.render(tablefmt='simple', condensed=False))
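
The thresholds above compare each concept list against the meanings attested for a variety or Glottolog language: coverage is the share of the list's Concepticon IDs that occur among the attested meanings. A worked example with made-up numbers:

concepts = {1, 2, 3, 4, 5}      # Concepticon IDs in one concept list
meanings = {1, 2, 3, 4, 9, 10}  # meanings attested for one variety

coverage = len(concepts & meanings) / len(concepts)  # 4 / 5 = 0.8
assert coverage >= 0.8  # tallied in res80v, but not in res85v or res90v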
Example #5
def check_phonotactics(args):
    """Check the segmented forms of a dataset"""
    with_dataset(args, Dataset._check_phonotactics)
Example #6
def check_profile(args):
    """Check orthography of a dataset"""
    with_dataset(args, Dataset._check_profile)
Example #7
def makecldf(args):
    """Convert a dataset into CLDF

    lexibank makecldf DATASET_ID
    """
    with_dataset(args, Dataset._install)
Example #8
def download(args):
    """Run a dataset's download command

    lexibank download DATASET_ID
    """
    with_dataset(args, Dataset._download)
Example #9
def unload(args):
    """
    Remove a dataset's data from the lexibank database.

    lexibank unload [DATASET_ID]
    """
    with_dataset(args, _unload, default_to_all=True)
Example #10
from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
from prompt_toolkit.completion import Completer, Completion
from termcolor import colored
from appdirs import user_data_dir
from clldutils.path import Path
from clldutils.clilib import command

from pylexibank.util import aligned
from pylexibank.commands.util import with_dataset, _load, _unload
from pylexibank.dataset import Dataset

commands = {
    'quit': lambda args: None,
    'download': lambda args: with_dataset(args, Dataset._download),
    'makecldf': lambda args: with_dataset(args, Dataset._install),
    'dbload': lambda args: with_dataset(args, _load),
    'dbunload': lambda args: with_dataset(args, _unload),
    'orthography': lambda args: None,
    'help': lambda args: print("Available Commands: \n%s" % aligned([
        (k, getattr(v, '__doc__', '')) for k, v in sorted(commands.items())
    ])),
}
commands['quit'].__doc__ = ': exits lexibank curator'
commands['download'].__doc__ = "<dataset> : run <dataset>'s download method"
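
The commands mapping is presumably consumed by a read-eval loop, which the excerpt does not show. A sketch of how such a loop could dispatch on the dict using prompt_toolkit's prompt() (the driver function itself is hypothetical):

from prompt_toolkit import prompt

def curator_loop(args):  # hypothetical driver, not part of the excerpt
    while True:
        tokens = prompt('lexibank-curator> ').strip().split()
        if not tokens:
            continue
        cmd, args.args = tokens[0], tokens[1:]
        if cmd not in commands:
            print('Unknown command: %s' % cmd)
            continue
        commands[cmd](args)
        if cmd == 'quit':
            break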