Beispiel #1
0
def run(args):
    """
    Print a summary of every builtin catalog to stdout.

    For each class in ``BUILTIN_CATALOGS`` a heading with the catalog's CLI name and GitHub
    URL is printed, followed by the location of the local clone, the config file (when the
    clone path was looked up there), at most ``args.max_versions`` version lines and - if the
    catalog declares an API - name and version of the API package.

    Catalogs for which no clone is configured or which cannot be initialized are skipped
    with a warning on ``args.log``.
    """
    config = Config.from_file()
    for catalog_cls in BUILTIN_CATALOGS:
        name = catalog_cls.cli_name()

        heading = '{0} - https://github.com/{1}'.format(name, catalog_cls.__github__)
        print()
        print(termcolor.colored(heading, attrs=['bold', 'underline']))
        print()

        # A path given on the command line wins; the config is only consulted as fallback.
        path = getattr(args, name)
        from_cfg = False
        if not (path or args.no_config):
            try:
                path = config.get_clone(name)
            except KeyError as e:
                args.log.warning(str(e))
                continue
            from_cfg = True

        try:
            catalog = catalog_cls(path)
        except ValueError as e:  # pragma: no cover
            args.log.warning(str(e))
            continue

        print_kv('local clone', catalog.dir.resolve())
        if from_cfg:
            print_kv('config at', config.fname())
        print_kv('versions')
        aligned = iter_aligned(catalog.iter_versions(), prefix='  ')
        for position, line in enumerate(aligned):
            if position >= args.max_versions:
                # Keep consuming the iterator, but do not print beyond the limit.
                continue
            print(line)
        if catalog.__api__:
            print_kv('API', '{0.__name__} {0.__version__}'.format(catalog.__api_pkg__))
        print()
Beispiel #2
0
def run(args):
    """
    Record the locations of catalog clones in the config file.

    For every class in ``BUILTIN_CATALOGS``:

    - a path passed on the command line is checked by instantiating the catalog (a
      ``ValueError`` is only logged as warning),
    - otherwise an existing clone at the catalog's default location is picked up,
    - otherwise the catalog is cloned from GitHub if ``args.quiet`` is set or the user
      confirms.

    Whatever location results is stored via ``cfg.add_clone``.
    """
    with Config.from_file() as cfg:
        for catalog_cls in BUILTIN_CATALOGS:
            location = getattr(args, catalog_cls.cli_name())
            if location:
                try:
                    catalog_cls(location)
                except ValueError as e:  # pragma: no cover
                    # NOTE(review): the path is still written to the config below.
                    args.log.warning(str(e))
            elif catalog_cls.default_location().exists():  # pragma: no cover
                location = catalog_cls(catalog_cls.default_location()).dir
                message = 'CLone of {0} exists at {1} - skipping'.format(
                    catalog_cls.__github__, catalog_cls.default_location())
                args.log.info(message)
            elif args.quiet or confirm(
                    'clone {0}?'.format(catalog_cls.__github__),
                    default=False):  # pragma: no cover
                url = 'https://github.com/{0}.git'.format(catalog_cls.__github__)
                message = 'Cloning {0} into {1} ...'.format(
                    url, catalog_cls.default_location())
                args.log.info(message)
                location = catalog_cls.clone(url).dir
                args.log.info('... done')

            if location:
                cfg.add_clone(catalog_cls.cli_name(), location)

    args.log.info('Config written to {0}'.format(cfg.fname()))
Beispiel #3
0
def run(args):
    """
    Write a subset of the dataset's wordlist and print coverage statistics.

    Only rows whose doculect is one of the selected varieties and whose Concepticon ID
    appears in at least one of the selected concept lists are kept. The subset is written
    as TSV to ``workflow/D_Chen_subset`` inside the dataset directory; a markdown-style
    coverage table is printed to stdout.
    """
    ds = Dataset()
    concepticon = Concepticon(Config.from_file().get_clone('concepticon'))
    full_wordlist = Wordlist.from_cldf(ds.dir.joinpath('cldf', 'cldf-metadata.json'))

    # Varieties to keep.
    languages = (
        "EasternLuobuohe",
        "WesternLuobuohe",
        "Chuanqiandian",
        "CentralGuizhouChuanqiandian",
        "WesternXiangxi",
        "EasternXiangxi",
        "Bana",
        "Younuo",
        "Numao",
        "EasternBahen",
        "WesternBaheng",
        "EasternQiandong",
        "WesternQiandong",
        "BiaoMin",
        "ZaoMin",
    )

    # Concept lists whose mapped concepts are kept.
    conceptlist_ids = (
        'Blust-2008-210',
        'Swadesh-1952-200',
        'Swadesh-1955-100',
        'Comrie-1977-207',
        'Matisoff-1978-200',
        'Sagart-2019-250',
        'Liu-2007-201',
        'SoHartmann-1988-280',
        'BeijingDaxue-1964-905',
    )
    concepts = {
        concept.concepticon_id
        for clist in conceptlist_ids
        for concept in concepticon.conceptlists[clist].concepts.values()
        if concept.concepticon_id
    }

    # Row 0 carries the column names, the remaining keys are the selected row IDs.
    selection = {0: full_wordlist.columns}
    selection.update(
        (idx, full_wordlist[idx])
        for idx, doculect, cid in full_wordlist.iter_rows('doculect', 'concepticon')
        if doculect in languages and cid in concepts)

    wl = Wordlist(selection)
    target = ds.dir.joinpath('workflow', 'D_Chen_subset').as_posix()
    wl.output('tsv', filename=target, prettify=False, ignore='all')
    args.log.info('Wordlist has {0} concepts and {1} varieties across {2} words.'.format(
        wl.height, wl.width, len(wl)))

    # Per-doculect coverage: absolute number of items and share of all concepts.
    table = []
    for doculect, items in wl.coverage().items():
        table.append([doculect, items, items / wl.height])
    print(tabulate(
        table,
        headers=['Doculect', 'Words', 'Coverage'],
        tablefmt='pipe',
        floatfmt='.2f'))
Beispiel #4
0
    def __call__(self, args):
        """
        Create an orthography profile for a wordlist and print it or write it to a file.

        :param args: Parsed command line arguments. The attributes read here are \
        ``input_file``, ``cldf``, ``column``, ``context``, ``count``, ``clts``, ``language``, \
        ``merge`` and ``output_file``.
        :return: Number of profile rows (excluding the header line).
        :raises ValueError: If ``args.column`` is not a column of the wordlist.
        :raises ImportError: If ``args.clts`` is set but the required packages are missing.
        """
        if args.cldf:
            wl = lingpy.basic.wordlist.Wordlist.from_cldf(args.input_file)
        else:
            wl = lingpy.basic.wordlist.Wordlist(args.input_file)

        # Column names are compared lower-cased; the validated, lower-cased name is also
        # the one handed to the profile function below.
        column = args.column.lower()
        if column not in wl.header:
            raise ValueError("Wrong column header specified!")

        if args.context:
            header = 'Grapheme\tIPA\tEXAMPLES\tLANGUAGES\tFREQUENCY\tCODEPOINTS'
            function = lingpy.sequence.profile.context_profile
        else:
            header = 'Grapheme\tIPA\tFREQUENCY\tCODEPOINTS'
            function = lingpy.sequence.profile.simple_profile

        if args.clts:
            try:
                from pyclts import CLTS
                from cldfcatalog import Config
            except ImportError as e:
                raise ImportError(
                    "Module pyclts is not installed on your system") from e
            clts = CLTS(Config.from_file().get_clone('clts')).bipa
        else:
            clts = False

        if args.language:
            # Restrict the wordlist to the rows selected via `args.language`.
            D = {0: sorted(wl.header, key=lambda x: wl.header[x])}
            for idx in wl.get_list(col=args.language, flat=True):
                D[idx] = wl[idx]
            wl = lingpy.basic.wordlist.Wordlist(D)

        rows = [
            '\t'.join(line)
            for line in function(
                wl, ref=column, clts=clts, merge_vowels=args.merge)]

        if args.count:
            # Number the rows once, so that the "NO" header column has matching values
            # no matter whether the profile is printed or written to a file.
            header = 'NO\t' + header
            rows = [
                '{0}\t{1}'.format(number, row)
                for number, row in enumerate(rows, start=1)]

        out = [header] + rows
        if args.output_file == 'stdout':
            for line in out:
                print(line)
        else:
            lingpy.util.write_text_file(args.output_file, out)

        return len(rows)
Beispiel #5
0
def _main(commands,
          args=None,
          catch_all=False,
          parsed_args=None,
          log=None,
          test=False):
    """
    Run a ``glottolog`` subcommand.

    :param commands: Passed to ``register_subcommands`` to populate the subparsers.
    :param args: Argument list to parse; only used if `parsed_args` is falsy.
    :param catch_all: If true, unexpected exceptions are printed and turned into exit code 1.
    :param parsed_args: Already parsed arguments, used instead of parsing `args`.
    :param log: Logger to attach as ``args.log``; if falsy, a ``Logging`` context is entered.
    :param test: Stored as ``args.test``.
    :return: Integer exit code.
    """
    # The configured clone is the default for --repos, the current directory the fallback.
    try:
        default_repos = Config.from_file().get_clone('glottolog')
    except KeyError:  # pragma: no cover
        default_repos = pathlib.Path('.')

    parser, subparsers = get_parser_and_subparsers('glottolog')
    parser.add_argument(
        '--repos',
        type=pathlib.Path,
        default=default_repos,
        help="clone of glottolog/glottolog")
    parser.add_argument(
        '--repos-version',
        default=None,
        help="version of repository data. Requires a git clone!")
    parser.add_argument(
        '--pkg-dir',
        default=pathlib.Path(__file__).parent,
        help=argparse.SUPPRESS)
    register_subcommands(subparsers, commands)

    if parsed_args:
        args = parsed_args
    else:
        args = parser.parse_args(args=args)
    args.test = test

    if not hasattr(args, "main"):
        parser.print_help()
        return 1

    def command_help(error):
        # Report the problem, then show the help of the selected subcommand.
        print(error)
        return _main(commands, args=[args._command, '-h'])

    with contextlib.ExitStack() as stack:
        if log:
            args.log = log
        else:  # pragma: no cover
            stack.enter_context(Logging(args.log, level=args.log_level))
        if args.repos_version:  # pragma: no cover
            # A specific version of the data was requested, so the repository is
            # entered as Catalog context manager:
            stack.enter_context(Catalog(args.repos, tag=args.repos_version))

        try:
            args.repos = Glottolog(args.repos)
        except Exception as e:
            return command_help(e)
        args.log.info('glottolog/glottolog at {0}'.format(args.repos.repos))

        try:
            return args.main(args) or 0
        except KeyboardInterrupt:  # pragma: no cover
            return 0
        except ParserError as e:
            return command_help(e)
        except Exception as e:  # pragma: no cover
            if not catch_all:
                raise
            print(e)
            return 1
Beispiel #6
0
def test_catalog_from_config(glottolog_dir, tmpds, mocker, tmpdir, fixtures_dir):
    """Catalog locations are read from the config; an invalid location makes the CLI exit."""
    from cldfcatalog import Config

    # First case: get a "good" value from config. The config directory is redirected to
    # `tmpdir` and confirmation prompts are answered with "no".
    user_config_dir = mocker.Mock(return_value=str(tmpdir))
    mocker.patch(
        'cldfcatalog.config.appdirs',
        mocker.Mock(user_config_dir=user_config_dir))
    decline = mocker.Mock(return_value=False)
    mocker.patch('cldfbench.commands.catconfig.confirm', decline)
    for argv in (['catconfig', '--glottolog', str(glottolog_dir)], ['catinfo']):
        cli.main(argv)

    # Second case: get an invalid path from config:
    with Config.from_file() as config:
        config.add_clone('glottolog', fixtures_dir)
    with pytest.raises(SystemExit):
        cli.main(['makecldf', tmpds])
Beispiel #7
0
def main(args=None, catch_all=False, parsed_args=None):
    """
    Run a ``norare`` subcommand.

    :param args: Argument list to parse; only used if `parsed_args` is falsy.
    :param catch_all: If true, unexpected exceptions are printed and turned into exit code 1.
    :param parsed_args: Already parsed arguments, used instead of parsing `args`.
    :return: Integer exit code.
    """
    # The configured Concepticon clone is the default for --repos.
    try:  # pragma: no cover
        default_repos = Config.from_file().get_clone('concepticon')
    except KeyError:  # pragma: no cover
        default_repos = pathlib.Path('.')

    parser, subparsers = get_parser_and_subparsers('norare')
    parser.add_argument(
        '--repos',
        type=PathType(type='dir'),
        default=default_repos,
        help="clone of concepticon/concepticon-data")
    parser.add_argument(
        '--repos-version',
        default=None,
        help="version of repository data. Requires a git clone!")
    parser.add_argument(
        '--norarepo',
        type=PathType(type='dir'),
        default=pathlib.Path('.'))

    register_subcommands(subparsers, pynorare.commands)

    args = parsed_args if parsed_args else parser.parse_args(args=args)
    if not hasattr(args, "main"):  # pragma: no cover
        parser.print_help()
        return 1

    with contextlib.ExitStack() as stack:
        stack.enter_context(Logging(args.log, level=args.log_level))
        if args.repos_version:  # pragma: no cover
            # A specific version of the data was requested, so the repository is
            # entered as Catalog context manager:
            stack.enter_context(Catalog(args.repos, tag=args.repos_version))

        concepticon = Concepticon(args.repos)
        args.repos = concepticon
        args.api = NoRaRe(args.norarepo, concepticon=concepticon)
        args.log.info('norare at {0}'.format(concepticon.repos))

        try:
            exit_code = args.main(args)
        except KeyboardInterrupt:  # pragma: no cover
            return 0
        except ParserError as e:  # pragma: no cover
            print(e)
            return main([args._command, '-h'])
        except Exception as e:  # pragma: no cover
            if not catch_all:  # pragma: no cover
                raise
            print(e)
            return 1
        return exit_code or 0
Beispiel #8
0
def run(args):
    """
    Update the clones of all builtin catalogs for which a location is known.

    The location is taken from the command line or - unless ``args.no_config`` is set -
    from the config file. Each item yielded by the catalog's ``update`` method is logged.
    Catalogs without location or with an invalid one are skipped.
    """
    config = Config.from_file()
    for catalog_cls in BUILTIN_CATALOGS:
        name = catalog_cls.cli_name()
        path = getattr(args, name)
        if not (path or args.no_config):  # pragma: no cover
            try:
                path = config.get_clone(name)
            except KeyError as e:
                args.log.warning(str(e))
                continue

        if not path:
            continue

        try:
            catalog = catalog_cls(path)
        except ValueError as e:  # pragma: no cover
            args.log.warning(str(e))
            continue

        for fetch_info in catalog.update():  # pragma: no cover
            message = '{0}: fetch {1.ref} {1.note}'.format(name, fetch_info)
            args.log.info(message)
Beispiel #9
0
def get_mappings(concepticon=None):
    """
    Read the gloss-to-Concepticon mappings shipped in a Concepticon repository.

    :param concepticon: Concepticon API object; if falsy, one is created for the clone \
    registered in the config file.
    :return: Pair ``(mappings, concepticon)``. ``mappings`` maps the language code taken \
    from each ``mappings/map-<language>*.tsv`` file name to a ``defaultdict`` which maps a \
    gloss to a list of ``(ID, priority, ontological category)`` tuples.
    """
    if not concepticon:
        concepticon = Concepticon(Config.from_file().get_clone('concepticon'))

    # The language code is the second "-"-separated component of the file name.
    files_by_language = {
        tsv.stem.split('-')[1]: tsv
        for tsv in concepticon.repos.joinpath('mappings').glob('map-*.tsv')
    }

    mappings = {}
    for language, tsv in files_by_language.items():
        by_gloss = collections.defaultdict(set)
        mappings[language] = by_gloss
        for row in reader(tsv, delimiter='\t', dicts=True):
            gloss = row['GLOSS'].split('///')[1]
            category = concepticon.conceptsets[row['ID']].ontological_category
            by_gloss[gloss].add((row['ID'], int(row['PRIORITY']), category))

        # Matches for a gloss are ordered by descending priority, then by ascending
        # Concepticon ID.
        for gloss in list(by_gloss):
            by_gloss[gloss] = sorted(
                by_gloss[gloss],
                key=lambda match: (-match[1], int(match[0])))
    return mappings, concepticon
Beispiel #10
0
from pysen.glosses import to_concepticon
from pyconcepticon import Concepticon
from cldfcatalog import Config
from statistics import mean

repos = Config.from_file().get_clone('concepticon')
concepticon = Concepticon(repos)

results = []
for lst in concepticon.conceptlists.values():
    concepts = []
    if 'chinese' in lst.source_language:

        for concept in lst.concepts.values():
            concepts += [{
                'concept': concept.attributes['chinese'],
                'concepticon_id': concept.concepticon_id
            }]

        mappings = to_concepticon(concepts, gloss_ref='concept', language='zh')
        hits, total = 0, 0
        for concept in concepts:
            cid = concept['concepticon_id']
            tids = mappings[concept['concept']]
            scores = []
            for tid in tids:
                if tid[0] == cid:
                    scores += [1]
                else:
                    scores += [0]
            if scores:
Beispiel #11
0
    'Tai-Kadai',
    'Tupian',
    'Turkic',
    'Uralic',
    'Uto-Aztecan',
]

GLOTTOLOG_CODE_UPDATE = {
    'itsa1239': 'icar1234',
    'east2283': 'nucl1235',
    'ngar1286': 'yinh1234',
}


# Look up the glottolog clone registered in the catalog config.
cfg = Config.from_file()
try:
    gdir = cfg['clones'].get('glottolog', None)
except KeyError:
    # No "clones" section in the config - handled like a missing glottolog entry below.
    # (A bare `except` would also swallow KeyboardInterrupt and SystemExit.)
    gdir = None

if not gdir:
    raise RuntimeError("Unable to find glottolog dir. Please run `cldfbench catconfig`")


class Dataset:
    def __init__(self, label, files):
        self.label = label
        self.files = files
        # load data
        self.data = list(self.load(self._getfile('.txt')))
Beispiel #12
0
"""
Create a subselection of doculects and concepts from the dataset of Chén (2012).
"""
from lexibank_chenhmongmien import Dataset
from lingpy import *
from pyconcepticon import Concepticon
from cldfcatalog import Config
from tabulate import tabulate

from sys import argv

ds = Dataset()
concepticon = Concepticon(Config.from_file().get_clone('concepticon'))
wl = Wordlist.from_cldf(ds.dir.joinpath('cldf', 'cldf-metadata.json'), )

# languages
languages = [
    "EasternLuobuohe", "WesternLuobuohe", "Chuanqiandian",
    "CentralGuizhouChuanqiandian", "WesternXiangxi", "EasternXiangxi", "Bana",
    "Younuo", "Numao", "EasternBahen", "WesternBaheng", "EasternQiandong",
    "WesternQiandong", "BiaoMin", "ZaoMin"
]

# concepts
concepts = set()
for clist in [
        'Blust-2008-210',
        'Swadesh-1952-200',
        'Swadesh-1955-100',
        'Comrie-1977-207',
        'Matisoff-1978-200',
Beispiel #13
0
def main(args=None, catch_all=False, parsed_args=None, log=None):
    """
    Run a ``cldfbench`` subcommand.

    :param args: Argument list to parse; only used if `parsed_args` is falsy.
    :param catch_all: If true, unexpected exceptions raised by the command are printed and \
    turned into exit code 1 instead of being re-raised.
    :param parsed_args: Already parsed arguments, used instead of parsing `args`.
    :param log: Logger to attach as ``args.log``; if falsy, a ``Logging`` context is entered.
    :return: Integer exit code: the command's return value or 0; 1 if no command was \
    selected or an exception was caught due to `catch_all`.
    """
    parser, subparsers = get_parser_and_subparsers(cldfbench.__name__)

    # We add a "hidden" option to turn-off config file reading in tests:
    parser.add_argument('--no-config',
                        default=False,
                        action='store_true',
                        help=argparse.SUPPRESS)
    add_csv_field_size_limit(parser, default=csv.field_size_limit())

    # Discover available commands:
    # Commands are identified by (<entry point name>).<module name>
    register_subcommands(subparsers,
                         cldfbench.commands,
                         entry_point='cldfbench.commands')

    args = parsed_args or parser.parse_args(args=args)
    # Without a selected subcommand there is nothing to run.
    if not hasattr(args, "main"):
        parser.print_help()
        return 1

    with contextlib.ExitStack() as stack:
        if not log:  # pragma: no cover
            stack.enter_context(Logging(args.log, level=args.log_level))
        else:
            args.log = log
        # args.no_catalogs is set by the `config` command, because this command specifies
        # catalog options **optionally**, and prompts for user input only in its `run` function.
        if not getattr(args, "no_catalogs", False):
            cfg = Config.from_file()
            for cls in BUILTIN_CATALOGS:
                # Now we loop over known catalogs, see whether they are used by the command,
                # and if so, "enter" the catalog.
                name, from_cfg = cls.cli_name(), False
                if hasattr(args, name):
                    # If no path was passed on the command line, we look up the config:
                    path = getattr(args, name)
                    if (not path) and (not args.no_config):
                        try:
                            path = cfg.get_clone(name)
                            from_cfg = True
                        except KeyError as e:  # pragma: no cover
                            # No clone configured: report it and show the command's help.
                            print(termcolor.colored(str(e) + '\n', 'red'))
                            return main([args._command, '-h'])
                    try:
                        # Replace the path in `args` with the entered catalog object; the
                        # ExitStack exits the catalog context when the command is done.
                        setattr(
                            args,
                            name,
                            stack.enter_context(
                                cls(path, getattr(args, name + '_version',
                                                  None))),
                        )
                    except ValueError as e:
                        print(
                            termcolor.colored(
                                '\nError initializing catalog {0}'.format(
                                    name), 'red'))
                        if from_cfg:
                            # Point the user to the config file the bad path came from.
                            print(
                                termcolor.colored(
                                    'from config {0}'.format(cfg.fname()),
                                    'red'))
                        print(termcolor.colored(str(e) + '\n', 'red'))
                        return main([args._command, '-h'])

        try:
            return args.main(args) or 0
        except KeyboardInterrupt:  # pragma: no cover
            return 0
        except ParserError as e:
            # Errors in command arguments: print the message, then the command's help.
            print(
                termcolor.colored('ERROR: {}\n'.format(e),
                                  'red',
                                  attrs={'bold'}))
            return main([args._command, '-h'])
        except Exception as e:
            if catch_all:  # pragma: no cover
                print(
                    termcolor.colored('ERROR: {}\n'.format(e),
                                      'red',
                                      attrs={'bold'}))
                return 1
            raise
Beispiel #14
0
def catalog(name, args):
    """
    Instantiate the catalog registered under `name` in ``CATALOGS``.

    The repository location is read from the attribute `name` of `args`; if that is falsy,
    the clone registered in the config file is used. The version is read from the attribute
    ``<name>_version`` of `args`.

    :raises argparse.ArgumentError: If no repository location could be determined.
    """
    catalog_cls = CATALOGS[name]
    repos = getattr(args, name)
    if not repos:
        repos = Config.from_file().get_clone(name)
    if not repos:  # pragma: no cover
        message = 'No repository specified for {0} and no config found'.format(name)
        raise argparse.ArgumentError(None, message)
    version = getattr(args, name + '_version')
    return catalog_cls(repos, version)
Beispiel #15
0
    def __init__(self, repos=None, datasets=None, concepticon=None):
        """
        Load annotations, bibliography and dataset metadata from a NoRaRe repository.

        :param repos: Repository location, passed on to ``API.__init__``.
        :param datasets: Optional mapping of already known datasets; extended in place.
        :param concepticon: Concepticon API object. If falsy, the clone registered in the \
        config file is used when available.
        :raises ValueError: If datasets listed in ``norare.tsv`` can only be resolved via \
        Concepticon but no Concepticon is available, or if a dataset cites a reference \
        which is in neither bibliography.
        """
        API.__init__(self, repos)
        self.datasets = datasets or collections.OrderedDict()

        if not concepticon:  # pragma: no cover
            try:
                concepticon = Concepticon(
                    Config.from_file().get_clone('concepticon'))
            except KeyError:
                # No clone configured. This is only a problem if datasets must be looked
                # up in Concepticon - which is checked explicitly below.
                pass

        # IDs of all datasets for which annotations exist.
        annotated = set()
        self.annotations = collections.defaultdict(collections.OrderedDict)
        annotation_keys = [
            'DATASET', 'NAME', 'LANGUAGE', 'STRUCTURE', 'TYPE',
            'NORARE', 'RATING', 'SOURCE', 'OTHER', 'NOTE'
        ]
        for row in reader(self.repos / 'norare.tsv',
                          delimiter='\t',
                          dicts=True):
            self.annotations[row['DATASET']][row['NAME'].lower()] = {
                k.lower(): row[k] for k in annotation_keys}
            annotated.add(row['DATASET'])

        # get bibliography
        self.refs = collections.OrderedDict()
        with self.repos.joinpath(
                'references', 'references.bib').open(encoding='utf-8') as fp:
            for key, entry in pybtex.database.parse_string(
                    fp.read(), bib_format='bibtex').entries.items():
                self.refs[key] = Source.from_entry(key, entry)

        all_refs = set(self.refs)
        if concepticon:
            all_refs = all_refs.union(concepticon.bibliography)

        for row in reader(self.repos / 'concept_set_meta.tsv',
                          delimiter='\t',
                          dicts=True):
            row['norare'] = self
            row['path'] = self.repos.joinpath('concept_set_meta', row['ID'],
                                              row['ID'] + '.tsv-metadata.json')
            self.datasets[row['ID']] = ConceptSetMeta(
                **{k.lower(): v
                   for k, v in row.items()})
            self.datasets[row['ID']].source_language = [
                lg.lower().strip()
                for lg in self.datasets[row['ID']].source_language.split(',')
            ]

        # Remaining annotated datasets come from concepticon. They are processed in sorted
        # order, so that the order of `self.datasets` does not depend on set iteration.
        concepticon_datasets = sorted(
            d for d in annotated if d not in self.datasets)
        if concepticon_datasets and not concepticon:
            # Fail with an explicit message rather than an AttributeError on None.
            raise ValueError(
                'Concepticon is required to look up datasets: {}'.format(
                    ', '.join(concepticon_datasets)))
        for dataset in concepticon_datasets:
            ds = concepticon.conceptlists[dataset]
            self.datasets[ds.id] = ConceptSetMeta(
                id=ds.id,
                author=ds.author,
                year=ds.year,
                tags=', '.join(ds.tags),
                source_language=ds.source_language,
                target_language=ds.target_language,
                url=ds.url,
                refs=ds.refs,
                note=ds.note,
                alias=ds.alias,
                norare=self,
                path=concepticon.repos.joinpath('concepticondata',
                                                'conceptlists',
                                                ds.id + '.tsv-metadata.json'))

        # Every reference cited by a dataset must be in one of the bibliographies.
        for dataset in self.datasets.values():
            if dataset.refs:
                refs = [dataset.refs] if isinstance(dataset.refs,
                                                    str) else dataset.refs
                for ref in refs:
                    if ref not in all_refs:  # pragma: no cover
                        raise ValueError(
                            'missing references.bib: {}'.format(ref))