def run(args):
    cfg = Config.from_file()
    for cat in BUILTIN_CATALOGS:
        name = cat.cli_name()
        print()
        print(termcolor.colored(
            '{0} - https://github.com/{1}'.format(name, cat.__github__),
            attrs=['bold', 'underline']))
        print()

        path, from_cfg = getattr(args, name), False
        if (not path) and (not args.no_config):
            try:
                path, from_cfg = cfg.get_clone(name), True
            except KeyError as e:
                args.log.warning(str(e))
                continue
        try:
            cat = cat(path)
        except ValueError as e:  # pragma: no cover
            args.log.warning(str(e))
            continue

        print_kv('local clone', cat.dir.resolve())
        if from_cfg:
            print_kv('config at', cfg.fname())
        print_kv('versions')
        for i, version in enumerate(iter_aligned(cat.iter_versions(), prefix=' ')):
            if i < args.max_versions:
                print(version)
        if cat.__api__:
            print_kv('API', '{0.__name__} {0.__version__}'.format(cat.__api_pkg__))
        print()
def run(args):
    with Config.from_file() as cfg:
        for cat in BUILTIN_CATALOGS:
            val = getattr(args, cat.cli_name())
            if not val:
                if cat.default_location().exists():  # pragma: no cover
                    val = cat(cat.default_location()).dir
                    args.log.info(
                        'Clone of {0} exists at {1} - skipping'.format(
                            cat.__github__, cat.default_location()))
                elif args.quiet or confirm(
                        'clone {0}?'.format(cat.__github__), default=False):  # pragma: no cover
                    url = 'https://github.com/{0}.git'.format(cat.__github__)
                    args.log.info('Cloning {0} into {1} ...'.format(
                        url, cat.default_location()))
                    val = cat.clone(url).dir
                    args.log.info('... done')
            else:
                try:
                    cat(val)
                except ValueError as e:  # pragma: no cover
                    args.log.warning(str(e))
            if val:
                cfg.add_clone(cat.cli_name(), val)
        args.log.info('Config written to {0}'.format(cfg.fname()))
def run(args):
    ds = Dataset()
    concepticon = Concepticon(Config.from_file().get_clone('concepticon'))
    wl = Wordlist.from_cldf(ds.dir.joinpath('cldf', 'cldf-metadata.json'))

    # languages
    languages = [
        "EasternLuobuohe", "WesternLuobuohe", "Chuanqiandian",
        "CentralGuizhouChuanqiandian", "WesternXiangxi", "EasternXiangxi",
        "Bana", "Younuo", "Numao", "EasternBahen", "WesternBaheng",
        "EasternQiandong", "WesternQiandong", "BiaoMin", "ZaoMin"]

    # concepts
    concepts = set()
    for clist in [
            'Blust-2008-210', 'Swadesh-1952-200', 'Swadesh-1955-100',
            'Comrie-1977-207', 'Matisoff-1978-200', 'Sagart-2019-250',
            'Liu-2007-201', 'SoHartmann-1988-280', 'BeijingDaxue-1964-905',
            ]:
        for concept in concepticon.conceptlists[clist].concepts.values():
            if concept.concepticon_id:
                concepts.add(concept.concepticon_id)

    D = {0: wl.columns}
    for idx, doculect, cid in wl.iter_rows('doculect', 'concepticon'):
        if doculect in languages and cid in concepts:
            D[idx] = wl[idx]

    # rebuild the wordlist from the selected rows and write it to disk
    wl = Wordlist(D)
    wl.output(
        'tsv',
        filename=ds.dir.joinpath('workflow', 'D_Chen_subset').as_posix(),
        prettify=False,
        ignore='all')
    args.log.info('Wordlist has {0} concepts and {1} varieties across {2} words.'.format(
        wl.height, wl.width, len(wl)))

    # print statistics on coverage
    table = [[doculect, items, items / wl.height]
             for doculect, items in wl.coverage().items()]
    print(tabulate(
        table, headers=['Doculect', 'Words', 'Coverage'],
        tablefmt='pipe', floatfmt='.2f'))
def __call__(self, args):
    if args.cldf:
        wl = lingpy.basic.wordlist.Wordlist.from_cldf(args.input_file)
    else:
        wl = lingpy.basic.wordlist.Wordlist(args.input_file)

    # optionally prepend a numbering column to the header
    count = 'NO\t' if args.count else ''
    if args.context:
        out = [count + 'Grapheme\tIPA\tEXAMPLES\tLANGUAGES\tFREQUENCY\tCODEPOINTS']
        function = lingpy.sequence.profile.context_profile
    else:
        out = [count + 'Grapheme\tIPA\tFREQUENCY\tCODEPOINTS']
        function = lingpy.sequence.profile.simple_profile

    # lingpy stores column headers in lower case
    column = args.column.lower()
    if column not in wl.header:
        raise ValueError("Wrong column header specified!")

    if args.clts:
        try:
            from pyclts import CLTS
            from cldfcatalog import Config
            clts = CLTS(Config.from_file().get_clone('clts')).bipa
        except ImportError:
            raise ImportError("Module pyclts is not installed on your system")
    else:
        clts = False

    if args.language:
        # restrict the wordlist to a single language
        D = {0: [h for h in sorted(wl.header, key=lambda x: wl.header[x])]}
        for idx in wl.get_list(col=args.language, flat=True):
            D[idx] = wl[idx]
        wl = lingpy.basic.wordlist.Wordlist(D)

    for line in function(wl, ref=args.column, clts=clts, merge_vowels=args.merge):
        out += ['\t'.join(line)]

    if args.output_file == 'stdout':
        print(out[0])
        for i, line in enumerate(out[1:]):
            if args.count:
                print(str(i + 1) + '\t' + line)
            else:
                print(line)
    else:
        lingpy.util.write_text_file(args.output_file, out)
    return len(out) - 1
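# A minimal usage sketch for the profile command above, assuming the class it
# belongs to is instantiated as `profile` (a hypothetical name) and that lingpy
# is installed; the input file name and column are illustrative.
import argparse

args = argparse.Namespace(
    cldf=False, input_file='wordlist.tsv', count=False, context=False,
    column='ipa', clts=False, language=None, merge=True, output_file='stdout')
n_rows = profile(args)  # prints the orthography profile and returns the row count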
def _main(commands, args=None, catch_all=False, parsed_args=None, log=None, test=False):
    try:
        repos = Config.from_file().get_clone('glottolog')
    except KeyError:  # pragma: no cover
        repos = pathlib.Path('.')

    parser, subparsers = get_parser_and_subparsers('glottolog')
    parser.add_argument(
        '--repos', help="clone of glottolog/glottolog", default=repos, type=pathlib.Path)
    parser.add_argument(
        '--repos-version',
        help="version of repository data. Requires a git clone!",
        default=None)
    parser.add_argument(
        '--pkg-dir', help=argparse.SUPPRESS, default=pathlib.Path(__file__).parent)
    register_subcommands(subparsers, commands)

    args = parsed_args or parser.parse_args(args=args)
    args.test = test

    if not hasattr(args, "main"):
        parser.print_help()
        return 1

    with contextlib.ExitStack() as stack:
        if not log:  # pragma: no cover
            stack.enter_context(Logging(args.log, level=args.log_level))
        else:
            args.log = log
        if args.repos_version:  # pragma: no cover
            # If a specific version of the data is to be used, we make
            # use of a Catalog as context manager:
            stack.enter_context(Catalog(args.repos, tag=args.repos_version))
        try:
            args.repos = Glottolog(args.repos)
        except Exception as e:
            print(e)
            return _main(commands, args=[args._command, '-h'])
        args.log.info('glottolog/glottolog at {0}'.format(args.repos.repos))
        try:
            return args.main(args) or 0
        except KeyboardInterrupt:  # pragma: no cover
            return 0
        except ParserError as e:
            print(e)
            return _main(commands, args=[args._command, '-h'])
        except Exception as e:  # pragma: no cover
            if catch_all:
                print(e)
                return 1
            raise
def test_catalog_from_config(glottolog_dir, tmpds, mocker, tmpdir, fixtures_dir):
    from cldfcatalog import Config

    # First case: get a "good" value from config:
    mocker.patch(
        'cldfcatalog.config.appdirs',
        mocker.Mock(user_config_dir=mocker.Mock(return_value=str(tmpdir))))
    mocker.patch('cldfbench.commands.catconfig.confirm', mocker.Mock(return_value=False))
    cli.main(['catconfig', '--glottolog', str(glottolog_dir)])
    cli.main(['catinfo'])

    # Second case: get an invalid path from config:
    with Config.from_file() as cfg:
        cfg.add_clone('glottolog', fixtures_dir)
    with pytest.raises(SystemExit):
        cli.main(['makecldf', tmpds])
def main(args=None, catch_all=False, parsed_args=None):
    try:  # pragma: no cover
        repos = Config.from_file().get_clone('concepticon')
    except KeyError:  # pragma: no cover
        repos = pathlib.Path('.')

    parser, subparsers = get_parser_and_subparsers('norare')
    parser.add_argument(
        '--repos',
        help="clone of concepticon/concepticon-data",
        default=repos,
        type=PathType(type='dir'))
    parser.add_argument(
        '--repos-version',
        help="version of repository data. Requires a git clone!",
        default=None)
    parser.add_argument(
        '--norarepo',
        default=pathlib.Path('.'),
        type=PathType(type='dir'))
    register_subcommands(subparsers, pynorare.commands)

    args = parsed_args or parser.parse_args(args=args)

    if not hasattr(args, "main"):  # pragma: no cover
        parser.print_help()
        return 1

    with contextlib.ExitStack() as stack:
        stack.enter_context(Logging(args.log, level=args.log_level))
        if args.repos_version:  # pragma: no cover
            # If a specific version of the data is to be used, we make
            # use of a Catalog as context manager:
            stack.enter_context(Catalog(args.repos, tag=args.repos_version))
        args.repos = Concepticon(args.repos)
        args.api = NoRaRe(args.norarepo, concepticon=args.repos)
        args.log.info('norare at {0}'.format(args.repos.repos))
        try:
            return args.main(args) or 0
        except KeyboardInterrupt:  # pragma: no cover
            return 0
        except ParserError as e:  # pragma: no cover
            print(e)
            return main([args._command, '-h'])
        except Exception as e:  # pragma: no cover
            if catch_all:  # pragma: no cover
                print(e)
                return 1
            raise
def run(args):
    cfg = Config.from_file()
    for cat in BUILTIN_CATALOGS:
        name = cat.cli_name()
        path = getattr(args, name)
        if (not path) and (not args.no_config):  # pragma: no cover
            try:
                path = cfg.get_clone(name)
            except KeyError as e:
                args.log.warning(str(e))
                continue
        if path:
            try:
                cat = cat(path)
            except ValueError as e:  # pragma: no cover
                args.log.warning(str(e))
                continue
            for fetch_info in cat.update():  # pragma: no cover
                args.log.info('{0}: fetch {1.ref} {1.note}'.format(name, fetch_info))
def get_mappings(concepticon=None):
    concepticon = concepticon or Concepticon(Config.from_file().get_clone('concepticon'))
    paths = {
        p.stem.split('-')[1]: p
        for p in concepticon.repos.joinpath('mappings').glob('map-*.tsv')}
    mappings = {}
    for language, path in paths.items():
        mappings[language] = collections.defaultdict(set)
        for line in reader(path, delimiter='\t', dicts=True):
            gloss = line['GLOSS'].split('///')[1]
            oc = concepticon.conceptsets[line['ID']].ontological_category
            mappings[language][gloss].add((line['ID'], int(line['PRIORITY']), oc))
    for language, path in paths.items():
        for k, v in mappings[language].items():
            # We sort concepticon matches for a given gloss by descending priority
            # and ascending Concepticon ID.
            mappings[language][k] = sorted(
                v, key=lambda x: (x[1], -int(x[0])), reverse=True)
    return mappings, concepticon
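# A minimal usage sketch for get_mappings, assuming a local concepticon clone is
# configured via cldfcatalog; the language key 'en' and the gloss 'hand' are
# illustrative.
mappings, concepticon = get_mappings()
for cid, priority, category in mappings['en'].get('hand', []):
    print(cid, priority, category)  # highest-priority match comes first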
from pysen.glosses import to_concepticon
from pyconcepticon import Concepticon
from cldfcatalog import Config
from statistics import mean

repos = Config.from_file().get_clone('concepticon')
concepticon = Concepticon(repos)

results = []
for lst in concepticon.conceptlists.values():
    concepts = []
    if 'chinese' in lst.source_language:
        for concept in lst.concepts.values():
            concepts += [{
                'concept': concept.attributes['chinese'],
                'concepticon_id': concept.concepticon_id}]
        mappings = to_concepticon(concepts, gloss_ref='concept', language='zh')
        hits, total = 0, 0
        for concept in concepts:
            cid = concept['concepticon_id']
            tids = mappings[concept['concept']]
            scores = []
            for tid in tids:
                if tid[0] == cid:
                    scores += [1]
                else:
                    scores += [0]
            if scores:
                # The original snippet breaks off here; collecting the mean hit
                # rate per concept is an assumption, suggested by the `mean`
                # import and the `results` list above.
                results += [mean(scores)]
    'Tai-Kadai',
    'Tupian',
    'Turkic',
    'Uralic',
    'Uto-Aztecan',
]

GLOTTOLOG_CODE_UPDATE = {
    'itsa1239': 'icar1234',
    'east2283': 'nucl1235',
    'ngar1286': 'yinh1234',
}

# try to load glottolog
cfg = Config.from_file()
try:
    gdir = cfg['clones'].get('glottolog', None)
except KeyError:
    gdir = None
if not gdir:
    raise RuntimeError("Unable to find glottolog dir. Please run `cldfbench catconfig`")


class Dataset:
    def __init__(self, label, files):
        self.label = label
        self.files = files
        # load data
        self.data = list(self.load(self._getfile('.txt')))
""" Create a subselection of doculects and concepts from the dataset of Chén (2012). """ from lexibank_chenhmongmien import Dataset from lingpy import * from pyconcepticon import Concepticon from cldfcatalog import Config from tabulate import tabulate from sys import argv ds = Dataset() concepticon = Concepticon(Config.from_file().get_clone('concepticon')) wl = Wordlist.from_cldf(ds.dir.joinpath('cldf', 'cldf-metadata.json'), ) # languages languages = [ "EasternLuobuohe", "WesternLuobuohe", "Chuanqiandian", "CentralGuizhouChuanqiandian", "WesternXiangxi", "EasternXiangxi", "Bana", "Younuo", "Numao", "EasternBahen", "WesternBaheng", "EasternQiandong", "WesternQiandong", "BiaoMin", "ZaoMin" ] # concepts concepts = set() for clist in [ 'Blust-2008-210', 'Swadesh-1952-200', 'Swadesh-1955-100', 'Comrie-1977-207', 'Matisoff-1978-200',
def main(args=None, catch_all=False, parsed_args=None, log=None):
    parser, subparsers = get_parser_and_subparsers(cldfbench.__name__)
    # We add a "hidden" option to turn off config file reading in tests:
    parser.add_argument(
        '--no-config', default=False, action='store_true', help=argparse.SUPPRESS)
    add_csv_field_size_limit(parser, default=csv.field_size_limit())

    # Discover available commands:
    # Commands are identified by (<entry point name>).<module name>
    register_subcommands(subparsers, cldfbench.commands, entry_point='cldfbench.commands')

    args = parsed_args or parser.parse_args(args=args)

    if not hasattr(args, "main"):
        parser.print_help()
        return 1

    with contextlib.ExitStack() as stack:
        if not log:  # pragma: no cover
            stack.enter_context(Logging(args.log, level=args.log_level))
        else:
            args.log = log

        # args.no_catalogs is set by the `config` command, because this command
        # specifies catalog options **optionally**, and prompts for user input only
        # in its `run` function.
        if not getattr(args, "no_catalogs", False):
            cfg = Config.from_file()
            for cls in BUILTIN_CATALOGS:
                # Now we loop over known catalogs, see whether they are used by the
                # command, and if so, "enter" the catalog.
                name, from_cfg = cls.cli_name(), False
                if hasattr(args, name):
                    # If no path was passed on the command line, we look up the config:
                    path = getattr(args, name)
                    if (not path) and (not args.no_config):
                        try:
                            path = cfg.get_clone(name)
                            from_cfg = True
                        except KeyError as e:  # pragma: no cover
                            print(termcolor.colored(str(e) + '\n', 'red'))
                            return main([args._command, '-h'])
                    try:
                        setattr(
                            args,
                            name,
                            stack.enter_context(
                                cls(path, getattr(args, name + '_version', None))),
                        )
                    except ValueError as e:
                        print(termcolor.colored(
                            '\nError initializing catalog {0}'.format(name), 'red'))
                        if from_cfg:
                            print(termcolor.colored(
                                'from config {0}'.format(cfg.fname()), 'red'))
                        print(termcolor.colored(str(e) + '\n', 'red'))
                        return main([args._command, '-h'])

        try:
            return args.main(args) or 0
        except KeyboardInterrupt:  # pragma: no cover
            return 0
        except ParserError as e:
            print(termcolor.colored('ERROR: {}\n'.format(e), 'red', attrs={'bold'}))
            return main([args._command, '-h'])
        except Exception as e:
            if catch_all:  # pragma: no cover
                print(termcolor.colored('ERROR: {}\n'.format(e), 'red', attrs={'bold'}))
                return 1
            raise
def catalog(name, args):
    repos = getattr(args, name) or Config.from_file().get_clone(name)
    if not repos:  # pragma: no cover
        raise argparse.ArgumentError(
            None, 'No repository specified for {0} and no config found'.format(name))
    return CATALOGS[name](repos, getattr(args, name + '_version'))
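# A minimal usage sketch for catalog(), assuming `args` was produced by a parser
# that defines --glottolog and --glottolog-version options; the catalog name
# 'glottolog' is illustrative.
glottolog = catalog('glottolog', args)
print(glottolog.dir)  # path to the local repository clone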
def __init__(self, repos=None, datasets=None, concepticon=None):
    API.__init__(self, repos)
    self.datasets = datasets or collections.OrderedDict()
    if not concepticon:  # pragma: no cover
        try:
            concepticon = Concepticon(Config.from_file().get_clone('concepticon'))
        except KeyError:
            pass

    datasets = set()
    self.annotations = collections.defaultdict(lambda: collections.OrderedDict())
    for row in reader(self.repos / 'norare.tsv', delimiter='\t', dicts=True):
        self.annotations[row['DATASET']][row['NAME'].lower()] = {
            k.lower(): row[k] for k in [
                'DATASET', 'NAME', 'LANGUAGE', 'STRUCTURE', 'TYPE', 'NORARE',
                'RATING', 'SOURCE', 'OTHER', 'NOTE']}
        datasets.add(row['DATASET'])

    # get bibliography
    self.refs = collections.OrderedDict()
    with self.repos.joinpath('references', 'references.bib').open(encoding='utf-8') as fp:
        for key, entry in pybtex.database.parse_string(
                fp.read(), bib_format='bibtex').entries.items():
            self.refs[key] = Source.from_entry(key, entry)
    all_refs = set(self.refs)
    if concepticon:
        all_refs = all_refs.union(concepticon.bibliography)

    for row in reader(self.repos / 'concept_set_meta.tsv', delimiter='\t', dicts=True):
        row['norare'] = self
        row['path'] = self.repos.joinpath(
            'concept_set_meta', row['ID'], row['ID'] + '.tsv-metadata.json')
        self.datasets[row['ID']] = ConceptSetMeta(
            **{k.lower(): v for k, v in row.items()})
        self.datasets[row['ID']].source_language = [
            lg.lower().strip()
            for lg in self.datasets[row['ID']].source_language.split(',')]

    # remaining datasets come from concepticon; we identify them via `datasets`
    concepticon_datasets = [d for d in datasets if d not in self.datasets]
    for dataset in concepticon_datasets:
        ds = concepticon.conceptlists[dataset]
        self.datasets[ds.id] = ConceptSetMeta(
            id=ds.id,
            author=ds.author,
            year=ds.year,
            tags=', '.join(ds.tags),
            source_language=ds.source_language,
            target_language=ds.target_language,
            url=ds.url,
            refs=ds.refs,
            note=ds.note,
            alias=ds.alias,
            norare=self,
            path=concepticon.repos.joinpath(
                'concepticondata', 'conceptlists', ds.id + '.tsv-metadata.json'))

    for dataset in self.datasets.values():
        if dataset.refs:
            refs = [dataset.refs] if isinstance(dataset.refs, str) else dataset.refs
            for ref in refs:
                if ref not in all_refs:  # pragma: no cover
                    raise ValueError(
                        'missing reference in references.bib: {}'.format(ref))
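# A minimal instantiation sketch for the API above, assuming a local clone of
# the norare-data repository at the illustrative path 'norare-data' and a
# configured concepticon clone for the fallback lookup.
norare = NoRaRe('norare-data')
print(len(norare.datasets), 'datasets;', len(norare.refs), 'references')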