def cldf(args):
    """
    Create CLDF datasets from the raw data for a dataset.

    lexibank --glottolog-repos PATH --concepticon-repos PATH cldf [DATASET_ID]
    """
    # Both repository checkouts must exist before we can build CLDF data.
    if not args.glottolog_repos or not Path(args.glottolog_repos).exists():
        raise ParserError('Invalid glottolog repository path given')
    if not args.concepticon_repos or not Path(args.concepticon_repos).exists():
        raise ParserError('Invalid concepticon repository path given')

    # FIXME: get dict of all glottolog langs right here, and attach to datasets!
    try:
        # Prefer the cached languoid index, if one was dumped earlier.
        languoids = load('glottolog')
    except ValueError:
        # Cache miss: read languoids from the repository and cache them.
        languoids = {
            languoid.id: languoid
            for languoid in Glottolog(args.glottolog_repos).languoids()
        }
        dump(languoids, 'glottolog')

    def _cldf(ds, **kw):
        # Attach the languoid index before running the dataset's cldf step.
        ds.glottolog_languoids = languoids
        ds.cldf(**kw)
        ds.write_cognates()

    with_dataset(args, _cldf)
def load(args):
    """
    clics load /path/to/concepticon-data /path/to/glottolog
    """
    if len(args.args) != 2:
        raise ParserError(
            'concepticon and glottolog repos locations must be specified!')
    concepticon, glottolog = Path(args.args[0]), Path(args.args[1])
    if not concepticon.exists():
        raise ParserError('concepticon repository does not exist')
    if not glottolog.exists():
        raise ParserError('glottolog repository does not exist')

    db = args.api.db
    db.create(exists_ok=True)
    args.log.info('loading datasets into {0}'.format(db.fname))
    already_loaded = db.datasets
    for dataset in iter_datasets():
        # With --unloaded, datasets already in the DB are skipped.
        if args.unloaded and dataset.id in already_loaded:
            args.log.info('skipping {0} - already loaded'.format(dataset.id))
        else:
            args.log.info('loading {0}'.format(dataset.id))
            db.load(dataset)
    args.log.info('loading Concepticon data')
    db.load_concepticon_data(Concepticon(str(concepticon)))
    args.log.info('loading Glottolog data')
    db.load_glottolog_data(Glottolog(str(glottolog)))
def existing_lang(args):
    """
    Return the languoid identified by the first CLI argument.

    :raises ParserError: if no argument was given or no languoid matches it.
    """
    if not args.args:
        raise ParserError('No languoid specified')
    languoid = args.repos.languoid(args.args[0])
    if not languoid:
        raise ParserError('Invalid languoid spec')
    return languoid
def _get_dataset(args):
    """
    Instantiate a `Dataset` from the file given as first CLI argument.

    A ``.json`` suffix is interpreted as CLDF metadata file, anything else as
    CLDF core data file.

    :raises ParserError: if no argument was given or the path is not an existing file.
    """
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    fname = Path(args.args[0])
    if not fname.exists() or not fname.is_file():
        # Bug fix: the check requires a *file*, but the message claimed
        # "directory", misleading users who passed a directory path.
        raise ParserError('%s is not an existing file' % fname)
    if fname.suffix == '.json':
        return Dataset.from_metadata(fname)
    return Dataset.from_data(fname)
def new_dataset(args):
    """
    lexibank new-dataset OUTDIR [ID]

    Interactively scaffold a new dataset directory from the bundled template,
    substituting metadata values into file names and ``*_tmpl`` file contents.
    """
    if not args.args:
        raise ParserError('you must specify an existing directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    id_pattern = re.compile('[a-z_0-9]+$')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')
    while not id_pattern.match(md['id']):
        print(
            'dataset id must only consist of lowercase ascii letters, digits and _ (underscore)!'
        )
        md['id'] = input('Dataset ID: ')
    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()
    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    for path in Path(
            pylexibank.__file__).parent.joinpath('dataset_template').iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                # Bug fix: use a raw string for the regex. '\+' in a plain
                # string literal is an invalid escape sequence (SyntaxWarning
                # on modern Python, and an error in the future).
                target = re.sub(
                    r'\+([a-z]+)\+',
                    lambda m: '{' + m.groups()[0] + '}',
                    path.name).format(**md)
            if target.endswith('_tmpl'):
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))
    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)
def run(args):
    """
    Create an orthography profile for a dataset and write it to etc/orthography.tsv.

    With ``--context`` a context-aware profile (more columns, keyed per
    language) is computed instead of the simple one.
    """
    bipa = args.clts.api.bipa
    if args.context:
        make_profile = profile.context_profile
        columns = [
            'Grapheme', 'IPA', 'Examples', 'Languages', 'Frequence',
            'Codepoints'
        ]
        profile_kw = {'ref': 'form', 'clts': bipa, 'col': 'language_id'}
    else:
        make_profile = profile.simple_profile
        columns = ['Grapheme', 'IPA', 'Frequence', 'Codepoints']
        profile_kw = {'ref': 'form', 'clts': bipa}

    ds = get_dataset(args)
    profile_path = ds.etc_dir / 'orthography.tsv'
    if profile_path.exists() and not args.force:
        raise ParserError(
            'Orthography profile exists already. To overwrite, pass "-f" flag')

    # Build a lingpy-style wordlist dict: row 0 holds the header.
    header, wordlist_data = [], {}
    for idx, row in enumerate(ds.cldf_reader()['FormTable'], start=1):
        if idx == 1:
            header = [f for f in row.keys() if f != 'ID']
            wordlist_data = {0: ['lid'] + [h.lower() for h in header]}
        row['Segments'] = ' '.join(row['Segments'])
        wordlist_data[idx] = [row['ID']] + [row[h] for h in header]

    with UnicodeWriter(profile_path, delimiter='\t') as writer:
        writer.writerow(columns)
        wordlist = Wordlist(wordlist_data, row='parameter_id', col='language_id')
        for out_row in make_profile(wordlist, **profile_kw):
            writer.writerow(out_row)
    args.log.info('Orthography profile written to {0}'.format(profile_path))
def cmd(args):
    """
    Echo the first positional CLI argument to stdout.

    :raises ParserError: if no positional argument was given.
    """
    arguments = args.args
    if len(arguments) < 1:
        raise ParserError('not enough arguments')
    print(arguments[0])
def with_dataset(args: argparse.Namespace, func: typing.Union[callable, str], dataset=None) \
        -> typing.Any:
    """
    Run a callable, passing a dataset and `args` as arguments, returning its result.

    :param args: CLI arguments
    :param func: Callable with suitable signature or `str`, in which case a method \
    `_cmd_<name>` will be looked up on the dataset and run.
    :param dataset: `cldfbench.Dataset` instance or `None`, in which case a dataset will be \
    retrieved as specified by `args`.
    """
    dataset = dataset or get_dataset(args)
    started = time()
    call_args = [dataset]
    if isinstance(func, str):
        # Resolve a command name to a dataset method; '_cmd_<name>' wins
        # over 'cmd_<name>'.
        method = getattr(dataset, '_cmd_' + func, getattr(dataset, 'cmd_' + func, None))
        if not method:
            raise ParserError('Dataset {0} has no {1} command'.format(
                dataset.id, func))
        # Bound methods already carry the dataset, so drop it from the args.
        func, call_args = method, []
    args.log.info('running {0} on {1} ...'.format(
        getattr(func, '__name__', func), dataset.id))
    res = func(*call_args, args)
    args.log.info('... done %s [%.1f secs]' % (dataset.id, time() - started))
    return res
def get_dataset(args, name=None):
    """
    Resolve a dataset spec (directory path or dataset name) to a `Dataset`.

    :param name: dataset spec; defaults to the first positional CLI argument.
    :raises ParserError: if the spec matches no dataset directory.
    """
    name = name or args.args[0]
    candidate = Path(name)
    if is_dataset_dir(candidate):
        return Dataset(candidate)
    # Not a directory path - try resolving it as a name inside the repos.
    candidate = data_path(name, repos=args.lexibank_repos)
    if is_dataset_dir(candidate):
        return Dataset(candidate)
    raise ParserError('invalid dataset spec')
def get_dataset(args):
    """
    Return the dataset specified by `args.dataset`/`args.entry_point`.

    :raises ParserError: if no matching dataset was found.
    """
    dataset = _get(args.dataset, ep=args.entry_point)
    if not dataset:
        raise ParserError(termcolor.colored(
            '\nInvalid dataset spec: <{0}> {1}\n'.format(args.entry_point, args.dataset),
            "red"))
    return dataset
def newick(args):
    """
    Print a (sub)tree of the Glottolog classification in Newick format.
    """
    # Parse sub-command options from the remaining CLI arguments.
    parser = argparse.ArgumentParser(prog='newick')
    parser.add_argument('root', nargs='?', default=None, help='root node')
    parser.add_argument('--template', help='node label template', default=None)
    opts = parser.parse_args(args.args)
    if opts.root and not args.repos.languoid(opts.root):
        raise ParserError('Invalid root node {0}'.format(opts.root))
    sprint(args.repos.newick_tree(opts.root, template=opts.template))
def get_datasets(args):
    """
    Return all datasets matching the spec in `args.dataset`.

    :raises ParserError: if no dataset matches.
    """
    # '_' is shorthand for "all datasets"; globbing maps '_' to '*'.
    if args.glob or args.dataset == '_':
        args.dataset = args.dataset.replace('_', '*')
    matches = _gets(args.dataset, ep=args.entry_point, glob=args.glob)
    if not matches:
        raise ParserError(termcolor.colored(
            '\nInvalid dataset spec: <{0}> {1}\n'.format(args.entry_point, args.dataset),
            "red"))
    return matches
def check(args):
    """
    clpa check <STRING>

    Check a space-separated sound string against the CLPA whitelist and print
    the input symbols and the check result as two aligned tab-separated rows.
    """
    if len(args.args) != 1:
        raise ParserError('only one argument allowed')
    # Renamed from `check`: the local variable shadowed the function's own
    # name, which is confusing and would break any recursive/looked-up use.
    result = check_string(args.args[0], load_whitelist())
    print('\t'.join(args.args[0].split(' ')))
    print('\t'.join(result))
def glottolog_(args):
    """Update data derived from Glottolog

    dplace glottolog PATH/TO/GLOTTOLOG/REPOS YEAR VERSION
    """
    if len(args.args) != 3:
        # Bug fix: `!= 3` also rejects *surplus* arguments, so the old
        # message 'not enough arguments' was misleading; state the exact
        # requirement instead.
        raise ParserError('exactly three arguments are required')
    repos_path, year, version = args.args
    title = "Glottolog {0}".format(version)
    glottolog.update(args.repos, repos_path, year, title)
def run(args):
    """
    Search the Glottolog reference full-text index and print matches as a table.
    """
    try:
        fts.get_index(args.repos, must_exist=True)
    except ValueError:
        # The index is built by a separate command; fail with a hint.
        raise ParserError('Index does not exist. Run "glottolog searchindex" first!')
    count, results = fts.search(args.repos, args.query)
    with Table('ID', 'Author', 'Year', 'Title') as table:
        for hit in results:
            table.append([hit.id, hit.author, hit.year, hit.title])
    print('({} matches)'.format(count))
def tokenize(args):
    """
    Tokenize a string (passed as argument or read from stdin)

    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
    """
    if args.profile and not Path(args.profile).exists():  # pragma: no cover
        raise ParserError('--profile must be a path for an existing file')
    tokenizer = Tokenizer(profile=args.profile)
    _write(args, tokenizer(_read(args), column=args.mapping))
def configure(cfgpath=None):
    """
    Configure lexibank.

    :param cfgpath: path of the config file to read or create; defaults to \
    ``config.ini`` in the user's config directory.
    :return: a pair (config, logger)
    """
    # NOTE(review): the docstring claims "(config, logger)" but only `cfg` is
    # returned below - confirm which is intended.
    cfgpath = Path(cfgpath) \
        if cfgpath else Path(user_config_dir(pylexibank.__name__)) / 'config.ini'
    if not cfgpath.exists():
        # First run: greet the user and write a fresh config file.
        print("""
{0}

You seem to be running lexibank for the first time.
Your system configuration will now be written to a config file to be used
whenever lexibank is run lateron.
""".format(colored('Welcome to lexibank!', 'blue', attrs=['bold', 'reverse'])))
        if not cfgpath.parent.exists():
            cfgpath.parent.mkdir(parents=True)
        cfg = Config()
        # Seed the repository paths from the REPOS registry.
        cfg['paths'] = {k: get_path(src) for k, src in REPOS}
        cfg.write(cfgpath)
        print("""
Configuration has been written to:
{0}
You may edit this file to adapt to changes in your system or to reconfigure settings
such as the logging level.""".format(cfgpath.resolve()))
    else:
        cfg = Config.from_file(cfgpath)

    # Validate the configured repository paths; note the asymmetry: Glottolog
    # is checked via attribute access (may raise), Concepticon via the path.
    try:
        cfg.glottolog
    except (FileNotFoundError, ValueError):
        raise ParserError(
            'Misconfigured Glottolog path in {0}'.format(cfgpath))
    if not Path(cfg['paths']['concepticon']).exists():
        raise ParserError(
            'Misconfigured Concepticon path in {0}'.format(cfgpath))

    # Print the configuration directory for reference:
    print("Using configuration file at:")
    print(str(cfgpath) + '\n')

    return cfg
def run(args):  # pragma: no cover
    """
    Rebuild the app database from a glottolog-data checkout at a release tag.
    """
    try:
        args.version = assert_release(args.repos.repos)
    except AssertionError:
        raise ParserError('glottolog-data must be checked out at release tag!')
    ini_path = str(args.pkg_dir.parent / 'development.ini')
    _, settings = get_env_and_settings(ini_path)
    # Load and prime the data inside a freshly created DB.
    with FreshDB.from_settings(settings, log=args.log):
        dbload(args)
        dbprime(args)
def with_dataset(args, func, dataset=None):
    """
    Run a callable, passing a dataset and `args` as arguments, and return its result.

    :param args: CLI arguments
    :param func: Callable with suitable signature or `str`, in which case a method \
    `_cmd_<name>` (falling back to `cmd_<name>`) will be looked up on the dataset and run.
    :param dataset: Dataset instance or `None`, in which case a dataset will be \
    retrieved as specified by `args`.
    :return: whatever `func` returns (previously the result was discarded; now \
    returned for consistency with the typed variant of this helper).
    """
    dataset = dataset or get_dataset(args)
    s = time()
    arg = [dataset]
    if isinstance(func, str):
        func_ = getattr(dataset, '_cmd_' + func, getattr(dataset, 'cmd_' + func, None))
        if not func_:
            raise ParserError('Dataset {0} has no {1} command'.format(dataset.id, func))
        # Bound methods already carry the dataset, so drop it from the args.
        func, arg = func_, []
    args.log.info('running {0} on {1} ...'.format(getattr(func, '__name__', func), dataset.id))
    res = func(*arg, args)
    args.log.info('... done %s [%.1f secs]' % (dataset.id, time() - s))
    return res
def itemise(args):
    """
    Print one "name = value" line per dataset for the given attribute.

    The value is looked up first as a dataset attribute, falling back to the
    dataset's `details` mapping.
    """
    if len(args.args) != 1:
        raise ParserError("need a value to itemise")
    key = args.args[0]
    for name in sorted(args.repos.datasets):
        dataset = args.repos.datasets[name]
        try:
            value = getattr(dataset, key)
        except AttributeError:
            value = dataset.details.get(key, None)
        print("%s = %s" % (name.ljust(40), value))
def run(args):
    """
    Create a new languoid info file below the given parent (glottocode or directory).
    """
    if Glottocode.pattern.match(args.parent):
        # Parent given as glottocode: resolve it to the languoid's directory.
        args.parent = get_languoid(args, args.parent).dir
    else:
        args.parent = pathlib.Path(args.parent)
        if not args.parent.exists():
            raise ParserError('invalid parent dir specified')
    extra_props = dict(prop.split('=') for prop in args.props)
    lang = Languoid.from_name_id_level(
        args.parent,
        args.name,
        args.repos.glottocodes.new(args.name),
        args.level,
        **extra_props)
    print("Info written to %s" % lang.write_info(outdir=args.parent))
def get_dataset(args: argparse.Namespace) -> cldfbench.Dataset:
    """
    Get the `cldfbench.Dataset` specified by `args`.

    :raises ParserError: If no matching dataset was found.
    """
    dataset = _get(args.dataset, ep=args.entry_point)
    if not dataset:
        raise ParserError(termcolor.colored(
            '\nInvalid dataset spec: <{0}> {1}\n'.format(args.entry_point, args.dataset),
            "red"))
    return dataset
def get_datasets(args: argparse.Namespace) -> typing.List[cldfbench.Dataset]:
    """
    Get the `cldfbench.Dataset` s specified by `args`.

    :raises ParserError: If no matching datasets were found.
    """
    # '_' is shorthand for "all datasets"; globbing maps '_' to '*'.
    if args.glob or args.dataset == '_':
        args.dataset = args.dataset.replace('_', '*')
    matches = _gets(args.dataset, ep=args.entry_point, glob=args.glob)
    if not matches:
        raise ParserError(termcolor.colored(
            '\nInvalid dataset spec: <{0}> {1}\n'.format(args.entry_point, args.dataset),
            "red"))
    return matches
def recode(args):
    """Assign a new glottocode to an existing languoid.

    glottolog recode <code>
    """
    old_code = args.args[0]
    lang = args.repos.languoid(old_code)
    if not lang:
        raise ParserError('languoid not found')
    # Derive a fresh glottocode from the languoid's name and move its
    # directory accordingly.
    lang.id = Glottocode.from_name(lang.name)
    new_dir = lang.dir.parent.joinpath(lang.id)
    copytree(lang.dir, new_dir)
    lang.write_info(new_dir)
    # Drop the ini file named after the old code; write_info created the new one.
    remove(new_dir.joinpath('%s.ini' % old_code))
    rmtree(lang.dir)
    print("%s -> %s" % (old_code, lang.id))
def link(args):
    """
    Complete linking of concepts to concept sets. If either CONCEPTICON_GLOSS or
    CONCEPTICON_ID is given, the other is added.

    concepticon link <concept-list>
    """
    api = Concepticon(args.data)

    def _is_existing_file(p):
        # A usable conceptlist spec must point at a regular file.
        return p.exists() and p.is_file()

    conceptlist = Path(args.args[0])
    if not _is_existing_file(conceptlist):
        # Fall back to resolving the spec inside the API's conceptlists dir.
        conceptlist = api.data_path('conceptlists', args.args[0])
        if not _is_existing_file(conceptlist):
            raise ParserError('no file %s found' % args.args[0])
    rewrite(conceptlist, Linker(conceptlist.stem, api.conceptsets.values()))
def register(parser):
    """
    Add upload options to `parser`, with CDSTAR defaults read from the environment.
    """
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        # cdstarcat is an optional dependency of this command only.
        raise ParserError('pip install cdstarcat')
    env = os.environ
    parser.add_argument('version', help="version number without 'v' prefix")
    parser.add_argument(
        '--catalog',
        type=pathlib.Path,
        default=pathlib.Path(env['CDSTAR_CATALOG']))
    parser.add_argument('--url', default=env['CDSTAR_URL'])
    parser.add_argument('--user', default=env['CDSTAR_USER'])
    parser.add_argument('--pwd', default=env['CDSTAR_PWD'])
    parser.add_argument('--catalog_class', help=argparse.SUPPRESS, default=Catalog)
def createdb(args):
    """
    cldf createdb <DATASET> <SQLITE_DB_PATH>

    Load CLDF dataset <DATASET> into a SQLite DB, where <DATASET> may be the path to
    - a CLDF metadata file
    - a CLDF core data file
    """
    if len(args.args) < 2:
        raise ParserError('not enough arguments')
    database = Database(args.args[1])
    database.create()
    dataset = _get_dataset(args)
    database.load(dataset)
    args.log.info('{0} loaded in {1}'.format(dataset, database.fname))
def dbinit(args):
    """
    glottolog-app dbinit VERSION
    """
    if not args.args:
        raise ParserError('not enough arguments')
    # Drop the existing DB first; bail out if other connections block it.
    args.log.info('dropping DB {0}'.format(DB))
    try:
        subprocess.check_call(['dropdb', DB])
    except subprocess.CalledProcessError:
        args.log.error(
            'could not drop DB, maybe other processes are still accessing it.')
        return
    # Recreate the DB and populate it.
    args.log.info('creating DB {0}'.format(DB))
    subprocess.check_call(['createdb', DB])
    dbload(args)
    dbprime(args)
def beast2chars(args):
    """
    Print a per-site partition mapping for a BEAST2 XML file.

    For every TreeLikelihood distribution in the file, print one line per
    alignment site: ``<site index> <partition id>-<site number>`` (the first
    site of an ascertained partition is labelled ``-ascertained``).

    :raises ParserError: if not exactly one filename argument was given.
    """
    import xml.etree.ElementTree as ElementTree

    def find_filter(node):  # note recursive
        # Bug fix: the recursive call's result used to be discarded, so a
        # match found in a descendant could never be returned. Propagate it.
        for child in node:
            found = find_filter(child)
            if found:
                return found
        (p, x, y) = get_partition(node)
        if p and x and y:
            return (p, x, y)

    def get_partition(p):
        # 'filter' holds a 1-based site range like "1-885".
        x, y = [int(_) for _ in p.get('filter').split("-")]
        return (p.get('id'), x, y)

    def printchar(p, x, y, ascertained=False):
        n = 1
        for i in range(x, y + 1):
            label = "%s-%s" % (p, 'ascertained' if n == 1 and ascertained else str(n))
            print(i, label)
            n += 1

    def get_by_id(data_id):
        # '@id' references point at an <alignment> element elsewhere.
        if data_id.startswith("@"):
            data_id = data_id.lstrip("@")
        return xml.find(".//alignment[@id='%s']" % data_id)

    if len(args.args) != 1:
        raise ParserError("need an XML filename")

    xml = ElementTree.parse(args.args[0])
    for treelh in xml.findall(".//distribution[@spec='TreeLikelihood']"):
        if treelh.get('data'):
            # Alignment referenced by id from the distribution element.
            data = get_by_id(treelh.get('data'))
            ascertained = data.get('ascertained') == 'true'
            printchar(*get_partition(data.find('./data')), ascertained=ascertained)
        else:
            # Alignment nested inside the distribution element.
            data = treelh.find('./data')
            ascertained = data.get('ascertained') == 'true'
            if data.get('data'):
                datadata = get_by_id(data.get('data'))
            else:
                datadata = treelh.find('./data/data')
            printchar(*get_partition(datadata), ascertained=ascertained)
def run(args):
    """
    Map a concept list against Concepticon, optionally against a reference list.
    """
    # Note: Due to https://github.com/concepticon/pyconcepticon/issues/10 we require
    # specification of an output file on Windows:
    if platform.system() == 'Windows' and not args.output:  # pragma: no cover
        raise ParserError(
            'On Windows you must specify an output file since printing to the terminal may '
            'not work')
    reference = _get_conceptlist(args.reference_list, args, path_only=True) \
        if args.reference_list else None
    args.repos.map(
        get_conceptlist(args, path_only=True),
        otherlist=reference,
        out=args.output,
        full_search=args.full_search,
        language=args.language,
        skip_multiple=args.skip_multimatch,
    )