Exemple #1
0
def cldf(args):
    """
    Create CLDF datasets from the raw data for a dataset.

    lexibank --glottolog-repos PATH --concepticon-repos PATH cldf [DATASET_ID]

    Both repository paths must point to existing locations. The Glottolog languoids are
    read from a cache named 'glottolog' when `load` succeeds; otherwise they are collected
    from the repository and cached with `dump`.
    """
    glottolog_repos = args.glottolog_repos
    if not (glottolog_repos and Path(glottolog_repos).exists()):
        raise ParserError('Invalid glottolog repository path given')

    concepticon_repos = args.concepticon_repos
    if not (concepticon_repos and Path(concepticon_repos).exists()):
        raise ParserError('Invalid concepticon repository path given')

    # FIXME: get dict of all glottolog langs right here, and attach to datasets!
    try:
        languoids = load('glottolog')
    except ValueError:
        # No usable cache: build the id -> languoid mapping and cache it.
        languoids = {}
        for languoid in Glottolog(args.glottolog_repos).languoids():
            languoids[languoid.id] = languoid
        dump(languoids, 'glottolog')

    def _cldf(ds, **kw):
        # Attach the shared languoid mapping before running the dataset's conversion.
        ds.glottolog_languoids = languoids
        ds.cldf(**kw)
        ds.write_cognates()

    with_dataset(args, _cldf)
Exemple #2
0
def load(args):
    """
    Load all datasets, then Concepticon and Glottolog data, into the database.

    clics load /path/to/concepticon-data /path/to/glottolog
    """
    if len(args.args) != 2:
        raise ParserError(
            'concepticon and glottolog repos locations must be specified!')

    concepticon_dir = Path(args.args[0])
    if not concepticon_dir.exists():
        raise ParserError('concepticon repository does not exist')

    glottolog_dir = Path(args.args[1])
    if not glottolog_dir.exists():
        raise ParserError('glottolog repository does not exist')

    args.api.db.create(exists_ok=True)
    args.log.info('loading datasets into {0}'.format(args.api.db.fname))
    already_loaded = args.api.db.datasets
    for dataset in iter_datasets():
        # With `args.unloaded` set, datasets already present in the DB are left alone.
        if not (args.unloaded and dataset.id in already_loaded):
            args.log.info('loading {0}'.format(dataset.id))
            args.api.db.load(dataset)
        else:
            args.log.info('skipping {0} - already loaded'.format(dataset.id))

    args.log.info('loading Concepticon data')
    args.api.db.load_concepticon_data(Concepticon(str(concepticon_dir)))
    args.log.info('loading Glottolog data')
    args.api.db.load_glottolog_data(Glottolog(str(glottolog_dir)))
Exemple #3
0
def existing_lang(args):
    """
    Look up the languoid named by the first positional argument.

    :return: The truthy result of `args.repos.languoid(args.args[0])`.
    :raises ParserError: If no argument was given or the lookup returned a falsy value.
    """
    if args.args:
        languoid = args.repos.languoid(args.args[0])
        if languoid:
            return languoid
        raise ParserError('Invalid languoid spec')
    raise ParserError('No languoid specified')
Exemple #4
0
def _get_dataset(args):
    """
    Load the dataset specified by the first positional argument.

    :param args: Parsed CLI arguments; `args.args[0]` is the path of a metadata file
        (suffix `.json`) or of a data file.
    :return: `Dataset.from_metadata(path)` for `.json` files, `Dataset.from_data(path)` otherwise.
    :raises ParserError: If no argument was given or the path is not an existing file.
    """
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    # `is_file()` is already False for non-existing paths, so one check covers both cases.
    if not fname.is_file():
        # The spec must be a file, so say so (the message used to claim "directory").
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
Exemple #5
0
def new_dataset(args):
    """
    Create the skeleton of a new dataset from the bundled `dataset_template` directory.

    lexibank new-dataset OUTDIR [ID]

    The dataset ID is taken from the second argument or prompted for; the remaining
    metadata (title, url, license, conceptlist, citation) is always prompted for.
    Template files are copied into `OUTDIR/ID`; `+key+` placeholders in file names and
    `{key}` placeholders in the content of `*_tmpl` files are filled from the metadata.
    The metadata without the ID is written to `metadata.json`.

    :param args: Parsed CLI arguments; `args.args` is consumed via `pop`.
    :raises ParserError: If OUTDIR is missing or does not exist.
    """
    if not args.args:
        raise ParserError('you must specify an existing directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    id_pattern = re.compile('[a-z_0-9]+$')
    # Raw string: "\+" is not a valid escape sequence in a plain string literal.
    placeholder = re.compile(r'\+([a-z]+)\+')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')

    # Keep asking until the ID is valid, no matter where it came from.
    while not id_pattern.match(md['id']):
        print(
            'dataset id must only consist of lowercase ascii letters, digits and _ (underscore)!'
        )
        md['id'] = input('Dataset ID: ')

    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()

    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    template_dir = Path(pylexibank.__file__).parent.joinpath('dataset_template')
    for path in template_dir.iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                # Turn "+key+" into "{key}" and let str.format fill in the metadata.
                target = placeholder.sub(
                    lambda m: '{' + m.group(1) + '}', path.name).format(**md)
            if target.endswith('_tmpl'):
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            # Directories are copied wholesale, replacing any previous copy.
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))
    # The ID is encoded in the directory name and not stored in the metadata file.
    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)
Exemple #6
0
def run(args):
    """
    Write an orthography profile for a dataset to `orthography.tsv` in its etc directory.

    With `args.context` set, `profile.context_profile` is used (with extra columns),
    otherwise `profile.simple_profile`.

    :raises ParserError: If the profile exists already and `args.force` is not set.
    """
    options = {'ref': 'form', 'clts': args.clts.api.bipa}
    if args.context:
        make_profile = profile.context_profile
        columns = [
            'Grapheme', 'IPA', 'Examples', 'Languages', 'Frequence',
            'Codepoints'
        ]
        options['col'] = 'language_id'
    else:
        make_profile = profile.simple_profile
        columns = ['Grapheme', 'IPA', 'Frequence', 'Codepoints']

    dataset = get_dataset(args)
    target = dataset.etc_dir / 'orthography.tsv'
    if target.exists() and not args.force:
        raise ParserError(
            'Orthography profile exists already. To overwrite, pass "-f" flag')

    # Build the wordlist input: key 0 holds the header row, keys 1..n the forms.
    fields, wordlist_data = [], {}
    for rownum, form in enumerate(dataset.cldf_reader()['FormTable'], start=1):
        if not wordlist_data:
            # First row: derive the column order from it.
            fields = [name for name in form.keys() if name != 'ID']
            wordlist_data[0] = ['lid'] + [name.lower() for name in fields]

        form['Segments'] = ' '.join(form['Segments'])
        wordlist_data[rownum] = [form['ID']] + [form[name] for name in fields]

    with UnicodeWriter(target, delimiter='\t') as writer:
        writer.writerow(columns)
        wordlist = Wordlist(wordlist_data, row='parameter_id', col='language_id')
        for entry in make_profile(wordlist, **options):
            writer.writerow(entry)
    args.log.info('Orthography profile written to {0}'.format(target))
 def cmd(args):
     """
     Print the first positional argument.

     :raises ParserError: If no positional argument was given.
     """
     if len(args.args) >= 1:
         print(args.args[0])
         return
     raise ParserError('not enough arguments')
Exemple #8
0
def with_dataset(args: argparse.Namespace, func: typing.Union[callable, str], dataset=None) \
        -> typing.Any:
    """
    Call `func` for a dataset and return its result, logging start and elapsed time.

    :param args: CLI arguments; `args.log` receives the progress messages.
    :param func: Either a callable, invoked as `func(dataset, args)`, or a `str` naming a
        dataset method `_cmd_<name>` (falling back to `cmd_<name>`), invoked as `method(args)`.
    :param dataset: Dataset to operate on; if falsy, `get_dataset(args)` supplies one.
    :raises ParserError: If `func` is a `str` and the dataset has no matching method.
    """
    if not dataset:
        dataset = get_dataset(args)
    started = time()
    if isinstance(func, str):
        fallback = getattr(dataset, 'cmd_' + func, None)
        method = getattr(dataset, '_cmd_' + func, fallback)
        if not method:
            raise ParserError('Dataset {0} has no {1} command'.format(
                dataset.id, func))
        # Bound methods already know their dataset, so only `args` is passed on.
        func, call_args = method, (args,)
    else:
        call_args = (dataset, args)
    label = getattr(func, '__name__', func)
    args.log.info('running {0} on {1} ...'.format(label, dataset.id))
    result = func(*call_args)
    args.log.info('... done %s [%.1f secs]' % (dataset.id, time() - started))
    return result
Exemple #9
0
def get_dataset(args, name=None):
    """
    Resolve a dataset spec to a `Dataset`.

    The spec (`name`, or the first positional argument) is first tried as a directory
    path as given, then via `data_path` within `args.lexibank_repos`.

    :raises ParserError: If neither location passes `is_dataset_dir`.
    """
    name = name or args.args[0]

    local_dir = Path(name)
    if is_dataset_dir(local_dir):
        return Dataset(local_dir)

    repos_dir = data_path(name, repos=args.lexibank_repos)
    if is_dataset_dir(repos_dir):
        return Dataset(repos_dir)

    raise ParserError('invalid dataset spec')
Exemple #10
0
def get_dataset(args):
    """
    Return the dataset found by `_get` for `args.dataset` and `args.entry_point`.

    :raises ParserError: With a red-colored message if the lookup result is falsy.
    """
    dataset = _get(args.dataset, ep=args.entry_point)
    if not dataset:
        message = '\nInvalid dataset spec: <{0}> {1}\n'.format(
            args.entry_point, args.dataset)
        raise ParserError(termcolor.colored(message, "red"))
    return dataset
Exemple #11
0
def newick(args):
    """
    Print the result of `args.repos.newick_tree` for an optional root and label template.

    The positional arguments in `args.args` are parsed with a dedicated sub-parser
    accepting `[root] [--template TEMPLATE]`.

    :raises ParserError: If a root is given for which `args.repos.languoid` returns nothing.
    """
    subparser = argparse.ArgumentParser(prog='newick')
    subparser.add_argument('root', nargs='?', default=None, help='root node')
    subparser.add_argument('--template', help='node label template', default=None)
    options = subparser.parse_args(args.args)

    root = options.root
    if root:
        if not args.repos.languoid(root):
            raise ParserError('Invalid root node {0}'.format(root))
    sprint(args.repos.newick_tree(root, template=options.template))
Exemple #12
0
def get_datasets(args):
    """
    Return the datasets found by `_gets` for `args.dataset` and `args.entry_point`.

    In glob mode, or when the spec is exactly '_', every '_' in the spec is replaced
    with '*' (the result is stored back on `args.dataset`).

    :raises ParserError: With a red-colored message if nothing was found.
    """
    if args.glob or args.dataset == '_':
        args.dataset = args.dataset.replace('_', '*')
    matches = _gets(args.dataset, ep=args.entry_point, glob=args.glob)
    if not matches:
        message = '\nInvalid dataset spec: <{0}> {1}\n'.format(
            args.entry_point, args.dataset)
        raise ParserError(termcolor.colored(message, "red"))
    return matches
Exemple #13
0
def check(args):
    """
    clpa check <STRING>

    Run `check_string` on the string with the loaded whitelist and print two
    tab-separated lines: the space-separated parts of the input, then the results.
    """
    if len(args.args) != 1:
        raise ParserError('only one argument allowed')
    text = args.args[0]
    results = check_string(text, load_whitelist())
    for cells in (text.split(' '), results):
        print('\t'.join(cells))
Exemple #14
0
def glottolog_(args):
    """Update data derived from Glottolog

    dplace glottolog PATH/TO/GLOTTOLOG/REPOS YEAR VERSION
    """
    if len(args.args) != 3:
        raise ParserError('not enough arguments')
    repos_path, year, version = args.args
    glottolog.update(
        args.repos, repos_path, year, "Glottolog {0}".format(version))
Exemple #15
0
def run(args):
    """
    Search the full-text index for `args.query`, tabulate the hits and print the count.

    :raises ParserError: If `fts.get_index` reports (via ValueError) that no index exists.
    """
    try:
        fts.get_index(args.repos, must_exist=True)
    except ValueError:
        raise ParserError('Index does not exist. Run "glottolog searchindex" first!')

    count, hits = fts.search(args.repos, args.query)
    fields = ('id', 'author', 'year', 'title')
    with Table('ID', 'Author', 'Year', 'Title') as table:
        for hit in hits:
            table.append([getattr(hit, field) for field in fields])
    print('({} matches)'.format(count))
Exemple #16
0
def tokenize(args):
    """
    Tokenize a string (passed as argument or read from stdin)

    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
    """
    profile = args.profile
    if profile and not Path(profile).exists():  # pragma: no cover
        raise ParserError('--profile must be a path for an existing file')
    tokenizer = Tokenizer(profile=args.profile)
    _write(args, tokenizer(_read(args), column=args.mapping))
Exemple #17
0
def configure(cfgpath=None):
    """
    Configure lexibank.

    Reads the config file, or - if it does not exist yet - creates it with the paths
    obtained via `get_path` for each entry of `REPOS`.

    :param cfgpath: Path of the config file; defaults to `config.ini` in the user config
        directory of pylexibank.
    :return: the config object
    :raises ParserError: If the Glottolog or Concepticon path in the config is unusable.
    """
    if cfgpath:
        cfgpath = Path(cfgpath)
    else:
        cfgpath = Path(user_config_dir(pylexibank.__name__)) / 'config.ini'

    if cfgpath.exists():
        cfg = Config.from_file(cfgpath)
    else:
        banner = colored('Welcome to lexibank!', 'blue', attrs=['bold', 'reverse'])
        print("""
{0}

You seem to be running lexibank for the first time.
Your system configuration will now be written to a config file to be used
whenever lexibank is run lateron.
""".format(banner))
        config_dir = cfgpath.parent
        if not config_dir.exists():
            config_dir.mkdir(parents=True)
        cfg = Config()
        paths = {}
        for key, source in REPOS:
            paths[key] = get_path(source)
        cfg['paths'] = paths
        cfg.write(cfgpath)
        print("""
Configuration has been written to:
{0}
You may edit this file to adapt to changes in your system or to reconfigure settings
such as the logging level.""".format(cfgpath.resolve()))

    # Accessing the attribute is enough to surface a misconfigured Glottolog path.
    try:
        cfg.glottolog
    except (FileNotFoundError, ValueError):
        raise ParserError(
            'Misconfigured Glottolog path in {0}'.format(cfgpath))

    concepticon_dir = Path(cfg['paths']['concepticon'])
    if not concepticon_dir.exists():
        raise ParserError(
            'Misconfigured Concepticon path in {0}'.format(cfgpath))

    # Print the configuration directory for reference:
    print("Using configuration file at:")
    print(str(cfgpath) + '\n')
    return cfg
Exemple #18
0
def run(args):  # pragma: no cover
    """
    Load and prime a fresh database, using the settings from `development.ini`.

    The release determined by `assert_release` is stored as `args.version`.

    :raises ParserError: If `assert_release` fails with an AssertionError.
    """
    try:
        release = assert_release(args.repos.repos)
    except AssertionError:
        raise ParserError('glottolog-data must be checked out at release tag!')
    args.version = release

    ini_path = args.pkg_dir.parent / 'development.ini'
    _env, settings = get_env_and_settings(str(ini_path))
    with FreshDB.from_settings(settings, log=args.log):
        for step in (dbload, dbprime):
            step(args)
Exemple #19
0
def with_dataset(args, func, dataset=None):
    """
    Call `func` for a dataset, logging start and elapsed time.

    :param args: CLI arguments; `args.log` receives the progress messages.
    :param func: Either a callable, invoked as `func(dataset, args)`, or a `str` naming a
        dataset method `_cmd_<name>` (falling back to `cmd_<name>`), invoked as `method(args)`.
    :param dataset: Dataset to operate on; if falsy, `get_dataset(args)` supplies one.
    :raises ParserError: If `func` is a `str` and the dataset has no matching method.
    """
    if not dataset:
        dataset = get_dataset(args)
    started = time()
    if isinstance(func, str):
        fallback = getattr(dataset, 'cmd_' + func, None)
        method = getattr(dataset, '_cmd_' + func, fallback)
        if not method:
            raise ParserError('Dataset {0} has no {1} command'.format(dataset.id, func))
        # Bound methods already know their dataset, so only `args` is passed on.
        func, call_args = method, (args,)
    else:
        call_args = (dataset, args)
    label = getattr(func, '__name__', func)
    args.log.info('running {0} on {1} ...'.format(label, dataset.id))
    func(*call_args)
    args.log.info('... done %s [%.1f secs]' % (dataset.id, time() - started))
Exemple #20
0
def itemise(args):
    """
    Print one `<dataset> = <value>` line per dataset, in sorted key order.

    The value is the dataset attribute named by the single positional argument; if
    there is no such attribute, it is looked up in the dataset's `details` mapping
    (yielding `None` if absent there as well).

    :raises ParserError: Unless exactly one positional argument was given.
    """
    if len(args.args) != 1:
        raise ParserError("need a value to itemise")

    datasets = args.repos.datasets
    for key in sorted(datasets):
        dataset = datasets[key]
        try:
            value = getattr(dataset, args.args[0])
        except AttributeError:
            value = dataset.details.get(args.args[0], None)
        print(' = '.join((key.ljust(40), str(value))))
Exemple #21
0
def run(args):
    """
    Create a new languoid below a parent and write its info file.

    :param args: Parsed CLI arguments. Uses `args.parent` (a string matching
        `Glottocode.pattern`, or a directory path), `args.name`, `args.level`,
        `args.props` (`key=value` strings) and `args.repos`. `args.parent` is replaced
        with the resolved parent directory.
    :raises ParserError: If the parent directory does not exist, or if a property is
        not of the form `key=value`.
    """
    if Glottocode.pattern.match(args.parent):
        args.parent = get_languoid(args, args.parent).dir
    else:
        args.parent = pathlib.Path(args.parent)
        if not args.parent.exists():
            raise ParserError('invalid parent dir specified')

    # Parse the properties up front, so malformed input is reported before a new
    # glottocode is requested. Only the first "=" separates key from value, so values
    # may contain "=" themselves.
    props = {}
    for prop in args.props:
        key, sep, value = prop.partition('=')
        if not sep:
            raise ParserError(
                'invalid property "{0}", expected key=value'.format(prop))
        props[key] = value

    lang = Languoid.from_name_id_level(
        args.parent, args.name, args.repos.glottocodes.new(args.name),
        args.level, **props)

    print("Info written to %s" % lang.write_info(outdir=args.parent))
Exemple #22
0
def get_dataset(args: argparse.Namespace) -> cldfbench.Dataset:
    """
    Look up the `cldfbench.Dataset` selected by `args.dataset` and `args.entry_point`.

    :raises ParserError: With a red-colored message if no matching dataset was found.
    """
    dataset = _get(args.dataset, ep=args.entry_point)
    if not dataset:
        message = '\nInvalid dataset spec: <{0}> {1}\n'.format(
            args.entry_point, args.dataset)
        raise ParserError(termcolor.colored(message, "red"))
    return dataset
Exemple #23
0
def get_datasets(args: argparse.Namespace) -> typing.List[cldfbench.Dataset]:
    """
    Look up the `cldfbench.Dataset` s selected by `args.dataset` and `args.entry_point`.

    In glob mode, or when the spec is exactly '_', every '_' in the spec is replaced
    with '*' (the result is stored back on `args.dataset`).

    :raises ParserError: With a red-colored message if no matching datasets were found.
    """
    if args.glob or args.dataset == '_':
        args.dataset = args.dataset.replace('_', '*')
    matches = _gets(args.dataset, ep=args.entry_point, glob=args.glob)
    if not matches:
        message = '\nInvalid dataset spec: <{0}> {1}\n'.format(
            args.entry_point, args.dataset)
        raise ParserError(termcolor.colored(message, "red"))
    return matches
Exemple #24
0
def recode(args):
    """Assign a new glottocode to an existing languoid.

    glottolog recode <code>

    The languoid directory is copied to a sibling directory named after the new code,
    the info file is written there, and the old info file and old directory are removed.

    :param args: Parsed CLI arguments; `args.args[0]` is the current code.
    :raises ParserError: If no code was given or no languoid was found for it.
    """
    # Report a missing argument as a usage error rather than an IndexError.
    if not args.args:
        raise ParserError('no languoid specified')
    old_code = args.args[0]
    lang = args.repos.languoid(old_code)
    if not lang:
        raise ParserError('languoid not found')
    lang.id = Glottocode.from_name(lang.name)
    new_dir = lang.dir.parent.joinpath(lang.id)
    copytree(lang.dir, new_dir)
    lang.write_info(new_dir)
    # The copy still contains the info file named after the old code.
    remove(new_dir.joinpath('%s.ini' % old_code))
    rmtree(lang.dir)
    print("%s -> %s" % (old_code, lang.id))
Exemple #25
0
def link(args):
    """
    Complete linking of concepts to concept sets. If either CONCEPTICON_GLOSS or
    CONCEPTICON_ID is given, the other is added.

    concepticon link <concept-list>

    The concept list is looked up as a file path first, then via
    `api.data_path('conceptlists', ...)`; the file found is rewritten.

    :param args: Parsed CLI arguments; uses `args.data` and `args.args[0]`.
    :raises ParserError: If no concept list was given or no matching file exists.
    """
    # Report a missing argument as a usage error rather than an IndexError.
    if not args.args:
        raise ParserError('no concept list specified')
    spec = args.args[0]

    api = Concepticon(args.data)
    conceptlist = Path(spec)
    # `is_file()` is already False for non-existing paths.
    if not conceptlist.is_file():
        conceptlist = api.data_path('conceptlists', spec)
        if not conceptlist.is_file():
            raise ParserError('no file %s found' % spec)

    rewrite(conceptlist, Linker(conceptlist.stem, api.conceptsets.values()))
Exemple #26
0
def register(parser):
    """
    Register the command line arguments of this command on `parser`.

    The options `--catalog`, `--url`, `--user` and `--pwd` default to the environment
    variables `CDSTAR_CATALOG`, `CDSTAR_URL`, `CDSTAR_USER` and `CDSTAR_PWD`. If a
    variable is unset, the corresponding option becomes required instead, so
    registration itself no longer fails with a KeyError.

    :param parser: `argparse.ArgumentParser` to add the arguments to.
    :raises ParserError: If the `cdstarcat` package cannot be imported.
    """
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        raise ParserError('pip install cdstarcat')

    def add_env_option(flag, envvar, **kw):
        # Default to the environment variable if set; otherwise require the option.
        value = os.environ.get(envvar)
        if value is None:
            kw['required'] = True
        else:
            # Convert the default like a command line value would be converted.
            kw['default'] = kw['type'](value) if 'type' in kw else value
        parser.add_argument(flag, **kw)

    parser.add_argument('version', help="version number without 'v' prefix")
    add_env_option('--catalog', 'CDSTAR_CATALOG', type=pathlib.Path)
    add_env_option('--url', 'CDSTAR_URL')
    add_env_option('--user', 'CDSTAR_USER')
    add_env_option('--pwd', 'CDSTAR_PWD')
    parser.add_argument('--catalog_class',
                        help=argparse.SUPPRESS,
                        default=Catalog)
Exemple #27
0
def createdb(args):
    """
    cldf createdb <DATASET> <SQLITE_DB_PATH>

    Load CLDF dataset <DATASET> into a SQLite DB, where <DATASET> may be the path to
    - a CLDF metadata file
    - a CLDF core data file
    """
    positional = args.args
    if len(positional) < 2:
        raise ParserError('not enough arguments')
    # The database is created before the dataset spec is resolved.
    sqlite_db = Database(positional[1])
    sqlite_db.create()
    dataset = _get_dataset(args)
    sqlite_db.load(dataset)
    args.log.info('{0} loaded in {1}'.format(dataset, sqlite_db.fname))
Exemple #28
0
def dbinit(args):
    """
    glottolog-app dbinit VERSION

    Drop the database `DB`, create it again, then run `dbload` and `dbprime`.
    If dropping fails, an error is logged and nothing else is done.
    """
    if not args.args:
        raise ParserError('not enough arguments')

    args.log.info('dropping DB {0}'.format(DB))
    try:
        subprocess.check_call(['dropdb', DB])
    except subprocess.CalledProcessError:
        args.log.error(
            'could not drop DB, maybe other processes are still accessing it.')
        return

    args.log.info('creating DB {0}'.format(DB))
    subprocess.check_call(['createdb', DB])
    for step in (dbload, dbprime):
        step(args)
Exemple #29
0
def beast2chars(args):
    """
    Print the site labels of the partitions used by the tree likelihoods of an XML file.

    For every `distribution` element with `spec='TreeLikelihood'`, the partition is read
    from a `data` element carrying `id` and `filter="FIRST-LAST"` attributes. One line
    `<site> <id>-<n>` is printed per site in FIRST..LAST, with `n` counting from 1; for
    ascertained data the first label is `<id>-ascertained`.

    :param args: Parsed CLI arguments; `args.args` must hold exactly the XML filename.
    :raises ParserError: Unless exactly one positional argument was given.
    """
    import xml.etree.ElementTree as ElementTree

    def get_partition(node):
        # Return (id, first, last) from the node's `id` and `filter="first-last"`.
        first, last = [int(part) for part in node.get('filter').split("-")]
        return (node.get('id'), first, last)

    def printchar(partition, first, last, ascertained=False):
        # Print one "<site> <partition>-<n>" line per site in first..last.
        for n, site in enumerate(range(first, last + 1), start=1):
            suffix = 'ascertained' if n == 1 and ascertained else str(n)
            print(site, "%s-%s" % (partition, suffix))

    def get_by_id(data_id):
        # References may be written as "@id"; the alignment carries the bare id.
        return tree.find(".//alignment[@id='%s']" % data_id.lstrip("@"))

    if len(args.args) != 1:
        raise ParserError("need an XML filename")

    tree = ElementTree.parse(args.args[0])

    for treelh in tree.findall(".//distribution[@spec='TreeLikelihood']"):
        if treelh.get('data'):
            # The likelihood references an alignment, which wraps the filtered data.
            data = get_by_id(treelh.get('data'))
            ascertained = data.get('ascertained') == 'true'
            printchar(*get_partition(data.find('./data')),
                      ascertained=ascertained)
        else:
            # The data is nested in the likelihood; the inner data may again be a reference.
            data = treelh.find('./data')
            ascertained = data.get('ascertained') == 'true'
            if data.get('data'):
                datadata = get_by_id(data.get('data'))
            else:
                datadata = treelh.find('./data/data')
            printchar(*get_partition(datadata), ascertained=ascertained)
Exemple #30
0
def run(args):
    """
    Call `args.repos.map` for the selected concept list, optionally against a reference list.

    :raises ParserError: On Windows, if no output file was specified.
    """
    # Note: Due to https://github.com/concepticon/pyconcepticon/issues/10 we require specification
    # of an output file on Windows:
    on_windows = platform.system() == 'Windows'
    if on_windows and not args.output:  # pragma: no cover
        raise ParserError(
            'On Windows you must specify an output file since printing to the terminal may '
            'not work')

    conceptlist = get_conceptlist(args, path_only=True)
    if args.reference_list:
        reference = _get_conceptlist(args.reference_list, args, path_only=True)
    else:
        reference = None

    args.repos.map(
        conceptlist,
        otherlist=reference,
        out=args.output,
        full_search=args.full_search,
        language=args.language,
        skip_multiple=args.skip_multimatch,
    )