def from_path(cls, path, spec=None):
    """
    Instantiate a corpus from a file path.

    :param path: Either a path to a CLDF dataset's metadata file or to a CLDF Examples \
    component as CSV file. Note that in the latter case, the file must use the default \
    column names, as defined in the CLDF ontology.
    :param spec: Optional spec passed through to `cls.from_cldf`.
    :return: `Corpus` instance.
    :raises ValueError: if `path` points to a CSV file without any data rows.
    """
    if isinstance(path, str):
        path = pathlib.Path(path)
    if path.suffix == '.json':
        # A full CLDF dataset was specified via its metadata file.
        return cls.from_cldf(Dataset.from_metadata(path), spec=spec)
    # We are given only an ExampleTable. Let's create the appropriate dataset:
    header = None
    for d in reader(path, dicts=True):
        # Only the column names are needed; stop after the first row.
        header = list(d.keys())
        break
    if header is None:
        # Without this guard, an empty CSV would surface as an obscure
        # TypeError from set(None) further down.
        raise ValueError('No data rows found in {0}'.format(path))
    ds = Dataset.from_metadata(
        pathlib.Path(pycldf.__file__).parent / 'modules' / 'Generic-metadata.json')
    # Anchor the (virtual) metadata file next to the data file, so relative
    # table URLs resolve correctly.
    ds.tablegroup._fname = path.parent / 'cldf-metadata.json'
    t = ds.add_component('ExampleTable')
    t.url = Link(path.name)
    # Trim the default schema to the columns actually present in the file,
    # and add any extra columns the file provides.
    default_cols = [col.name for col in t.tableSchema.columns]
    ds.remove_columns(t, *list(set(default_cols) - set(header)))
    ds.add_columns(t, *list(set(header) - set(default_cols)))
    return cls.from_cldf(ds, spec=spec)
def get_dataset(fname=None):
    """Load a CLDF dataset.

    The file is loaded as a `json` CLDF metadata description file, or as a
    metadata-free dataset contained in a single csv file. Which of the two
    applies is decided by the file extension: `.json` files are loaded as
    metadata descriptions, all other files are matched against the CLDF
    module specifications. Directories are checked for the presence of any
    CLDF datasets in undefined order of the dataset types.

    Parameters
    ----------
    fname : str or Path
        Path to a CLDF dataset

    Returns
    -------
    pycldf.Dataset
    """
    if fname is None:
        # Fall back to the packaged default repository.
        fname = repository
    else:
        fname = Path(fname)
        if not fname.exists():
            raise FileNotFoundError('{:} does not exist'.format(fname))
    loader = Dataset.from_metadata if fname.suffix == '.json' else Dataset.from_data
    return loader(fname)
def test_cldf(tmp_path):
    """Exercise the CLDFWriter context manager end-to-end."""
    # Accessing .cldf before entering the context manager must fail:
    with pytest.raises(AttributeError):
        _ = CLDFWriter().cldf
    with CLDFWriter(CLDFSpec(dir=tmp_path)):
        pass
    # The metadata was copied. Note: Path.glob returns a generator, which is
    # truthy even when it yields nothing - it must be materialized for the
    # assertion to actually check for matches.
    assert list(tmp_path.glob('*-metadata.json'))
    with CLDFWriter(
            CLDFSpec(module='StructureDataset',
                     dir=tmp_path,
                     data_fnames=dict(ValueTable='data.csv',
                                      ExampleTable='igt.csv'))) as writer:
        assert writer.cldf['ValueTable'] and writer.cldf['ExampleTable']
        writer['ValueTable', 'value'].separator = '|'
        writer.objects['ValueTable'].append(
            dict(ID=1, Language_ID='l', Parameter_ID='p', Value=[1, 2]))
    # The custom data file name was honored:
    assert tmp_path.joinpath('data.csv').exists()
    # Round-trip: read the written dataset back and check the list-valued cell.
    ds = Dataset.from_metadata(tmp_path / 'StructureDataset-metadata.json')
    values = list(ds['ValueTable'])
    assert len(values) == 1
    assert values[0]['Value'] == ['1', '2']
    # A plain path is not a valid CLDFSpec:
    with pytest.raises(AttributeError):
        CLDFWriter(tmp_path).validate()
def test_cldf_with_dataset(ds):
    """Writing with a `dataset` argument adds provenance metadata."""
    with CLDFWriter(CLDFSpec(dir=ds.cldf_dir), dataset=ds):
        pass
    cldf = Dataset.from_metadata(ds.cldf_dir.joinpath('Generic-metadata.json'))
    derived_from = [p['rdf:about'] for p in cldf.properties['prov:wasDerivedFrom']]
    assert 'http://example.org/raw' in derived_from
def get_dataset(p):
    """Load the CLDF dataset at path `p` for use as an `argparse` type.

    :param p: `pathlib.Path`; a metadata file (`.json`) or a data file.
    :return: `pycldf.Dataset`
    :raises argparse.ArgumentTypeError: if `p` is not a valid CLDF dataset spec.
    """
    try:
        return Dataset.from_metadata(
            p) if p.suffix == '.json' else Dataset.from_data(p)
    except ValueError as e:
        # Chain the original error so the underlying cause stays visible
        # in tracebacks instead of being silently discarded.
        raise argparse.ArgumentTypeError(
            'Invalid CLDF dataset spec: {0}!'.format(p)) from e
def run(args):
    # CLI entry point: (re-)create the app database and/or prime its cache,
    # optionally checking out reference catalogs at pinned versions.
    if (args.glottolog or args.concepticon) and Catalog is None:  # pragma: no cover
        print(
            'To use reference catalogs you must install the cldfcatalog package!'
        )
        # Non-zero return signals failure to the CLI wrapper.
        return 10
    if args.cldf:  # pragma: no cover
        # Replace the path-like spec with the loaded CLDF dataset.
        args.cldf = Dataset.from_metadata(args.cldf)
    with contextlib.ExitStack() as stack:
        if not args.prime_cache_only:
            # Start from a freshly (re-)created database.
            stack.enter_context(
                db.FreshDB.from_settings(args.settings, log=args.log))
        stack.enter_context(SessionContext(args.settings))
        for name in ['concepticon', 'glottolog']:
            if getattr(args, name):  # pragma: no cover
                if getattr(args, name + '_version'):
                    # Check out the requested catalog version for the
                    # duration of the import (restored on exit).
                    stack.enter_context(
                        Catalog(getattr(args, name),
                                tag=getattr(args, name + '_version')))
                else:
                    # No version pinning requested; just normalize to a Path.
                    setattr(args, name, pathlib.Path(getattr(args, name)))
        if not args.prime_cache_only:
            with transaction.manager:
                if args.initializedb:  # pragma: no cover
                    args.initializedb.main(args)
        # prime_cache (if provided) runs in its own transaction, even when
        # the main import was skipped.
        if hasattr(args.initializedb, 'prime_cache'):
            with transaction.manager:  # pragma: no cover
                args.initializedb.prime_cache(args)
def _make_client(cldf_md, tmpdir):
    """Load the dataset at `cldf_md` into a fresh SQLite db and return a test client."""
    dbpath = tmpdir / 'db.sqlite'
    dataset = Dataset.from_metadata(cldf_md)
    sqlite_db = Database(dataset, fname=dbpath, infer_primary_keys=True)
    sqlite_db.write_from_tg()
    return make_app_client(
        dbpath, metadata=datasette_cldf.metadata({'db': dataset}))
def run(args):
    """Recreate the database from the ABVD CLDF dataset, then prime the cache."""
    args.env, settings = get_env_and_settings(args.config_uri)
    with contextlib.ExitStack() as stack:
        # A fresh database plus an active session for the duration of the run.
        stack.enter_context(db.FreshDB.from_settings(settings, log=args.log))
        stack.enter_context(SessionContext(settings))
        metadata = args.abvd_cldf / 'cldf' / 'cldf-metadata.json'
        args.cldf = Dataset.from_metadata(metadata)
        if not args.prime_cache_only:
            with transaction.manager:
                main(args)
        with transaction.manager:
            prime_cache(args)
def test_cldf(tmpdir):
    """Exercise the CLDFWriter context manager end-to-end."""
    # Accessing .cldf before entering the context manager must fail:
    with pytest.raises(AttributeError):
        _ = CLDFWriter().cldf
    outdir = pathlib.Path(str(tmpdir))
    with CLDFWriter(CLDFSpec(dir=outdir)):
        pass
    # The metadata was copied. Note: Path.glob returns a generator, which is
    # truthy even when it yields nothing - it must be materialized for the
    # assertion to actually check for matches.
    assert list(outdir.glob('*-metadata.json'))
    with CLDFWriter(CLDFSpec(dir=outdir,
                             data_fnames=dict(ValueTable='data.csv'))) as writer:
        assert writer.cldf['ValueTable']
        writer['ValueTable', 'value'].separator = '|'
        writer.objects['ValueTable'].append(
            dict(ID=1, Language_ID='l', Parameter_ID='p', Value=[1, 2]))
    # Round-trip: read the written dataset back and check the list-valued cell.
    ds = Dataset.from_metadata(outdir / 'Generic-metadata.json')
    values = list(ds['ValueTable'])
    assert len(values) == 1
    assert values[0]['Value'] == ['1', '2']
    # A plain path is not a valid CLDFSpec:
    with pytest.raises(AttributeError):
        CLDFWriter(outdir).validate()
def test_Record_download_dataset(tmp_path, mocker, tests_dir, caplog, record):
    """Downloading a record's dataset yields a valid CLDF dataset and logs to `log`."""

    class FakeUrlopen:
        # Minimal stand-in for the context manager returned by
        # urllib.request.urlopen, serving a fixture zip archive.
        def __init__(self, *args):
            self.yielded = False
            self.code = 200

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            pass

        def read(self):
            return tests_dir.joinpath(
                'petersonsouthasia-v1.1.zip').read_bytes()

    mocker.patch('cldfzenodo.record.urllib.request.urlopen', FakeUrlopen)
    with caplog.at_level(logging.INFO):
        downloaded = record.download_dataset(
            tmp_path, log=logging.getLogger(__name__))
        assert Dataset.from_metadata(downloaded).validate()
    assert caplog.records
from collections import Counter

from tabulate import tabulate

from pycldf import Dataset

# Retrieve the WALS v2020 data from GitHub.
# (The proper release on Zenodo is zipped and would need to be downloaded first):
wals = Dataset.from_metadata(
    'https://raw.githubusercontent.com/cldf-datasets/wals/v2020/cldf/StructureDataset-metadata.json'
)

# Get feature 1A ...
feature1 = wals.get_object('ParameterTable', '1A')

# ... and count how often each of its codes was assigned:
code_counts = Counter(value.code.name for value in feature1.values)

print('\n{}\n\n{}'.format(
    feature1.name,
    tabulate(code_counts.most_common(), tablefmt='github')))
def run(args):
    # Serve one or more CLDF datasets via datasette: resolve the dataset
    # spec (DOI, path, or dataset ID), load each dataset into a SQLite db,
    # write datasette metadata, and launch datasette.
    ds = None
    if Zenodo.DOI_PATTERN.match(args.dataset):
        # A DOI was passed: download the deposit into the current directory.
        z = Zenodo()
        out = z.download_record(z.record_from_doi(args.dataset),
                                pathlib.Path('.'))
        args.log.info('Downloaded files for {0} to {1}'.format(
            args.dataset, out))
        cldf_ds = list(iter_datasets(out))
    else:
        p = pathlib.Path(args.dataset)
        if p.exists() and sniff(p):
            # A path to a CLDF metadata file was passed.
            cldf_ds = [Dataset.from_metadata(p)]
        else:  # pragma: no cover
            # Otherwise treat the spec as a dataset ID to be resolved.
            ds = get_dataset(args)
            cldf_ds = [ds.cldf_reader()]
    if not cldf_ds:
        raise ValueError('No CLDF dataset found for spec {0}'.format(
            args.dataset))
    # Size datasette's page size to the largest ParameterTable, capped at 600.
    try:
        count_p = max([len(list(cldf['ParameterTable'])) for cldf in cldf_ds])
    except KeyError:
        # No ParameterTable in any dataset; use the default.
        count_p = 100
    default_page_size = 100
    while default_page_size < count_p and default_page_size < 600:
        default_page_size += 100  # pragma: no cover
    # max_returned_rows Maximum rows that can be returned from a table
    # or custom query (default=1000)
    db_paths = []
    if args.db_path:  # pragma: no cover
        if len(cldf_ds) > 1:
            raise ValueError(
                'You cannot pass a db path, when multiple datasets are found')
    else:
        args.db_path = pathlib.Path(
            '{0}.sqlite'.format(ds.id if ds else 'cldf_db'))
    for i, cldf in enumerate(cldf_ds):
        if i == 0:
            db_path = args.db_path
        else:
            # Subsequent datasets get a numeric suffix: name_1.sqlite, ...
            db_path = args.db_path.parent / (
                args.db_path.stem + '_{0}'.format(i) + args.db_path.suffix)
        if not db_path.exists():
            # Only (re-)create the db if it is not already there.
            db = Database(cldf, fname=db_path, infer_primary_keys=True)
            db.write_from_tg()
            args.log.info('{0} loaded in {1}'.format(db.dataset, db.fname))
        db_paths.append(db_path)
    # NOTE(review): `db` here shadows the Database instance above - inside the
    # comprehension it is a Path, so `db.stem` is the file stem.
    jsonlib.dump(datasette_cldf.metadata(
        {db.stem: cldf for db, cldf in zip(db_paths, cldf_ds)}),
        args.cfg_path, indent=4)
    # NOTE(review): the command line is built by string formatting and run via
    # os.system; paths with shell metacharacters would break this - consider
    # subprocess.run with a list if that ever matters here.
    os.system(
        'datasette {0} -m {1} --template-dir {2} --config default_page_size:{3}'
        .format(' '.join(str(p) for p in db_paths), args.cfg_path,
                pathlib.Path(datasette_cldf.__file__).parent / 'templates',
                default_page_size))
def cldf_dataset():
    """Return the CLDF dataset bundled with the pylexibank package."""
    metadata = Path(pylexibank.__file__).parent / 'cldf-metadata.json'
    return Dataset.from_metadata(metadata)
def Wordlist():
    """Return the Wordlist test dataset located next to this module."""
    metadata = pathlib.Path(__file__).parent / 'Wordlist' / 'Wordlist-metadata.json'
    return Dataset.from_metadata(metadata)
def StructureDataset():
    """Return the StructureDataset test dataset located next to this module."""
    metadata = (pathlib.Path(__file__).parent / 'StructureDataset'
                / 'StructureDataset-metadata.json')
    return Dataset.from_metadata(metadata)
def Generic():
    """Return the Generic test dataset located next to this module."""
    metadata = pathlib.Path(__file__).parent / 'Generic' / 'Generic-metadata.json'
    return Dataset.from_metadata(metadata)
def Dictionary():
    """Return the Dictionary test dataset located next to this module."""
    metadata = (pathlib.Path(__file__).parent / 'Dictionary'
                / 'Dictionary-metadata.json')
    return Dataset.from_metadata(metadata)
def cldf_dataset(pytestconfig):
    """Return the dataset named by the `cldf_metadata` pytest option."""
    from pycldf import Dataset

    metadata = pytestconfig.getoption('cldf_metadata')
    return Dataset.from_metadata(metadata)
def dataset(metadata_path):
    """Return a `pycldf` dataset read from `metadata_path`."""
    loaded = Dataset.from_metadata(metadata_path)
    return loaded
def multilingual_dataset(fixtures):
    """Return the CLDF dataset from the 'multilingual' fixtures directory."""
    metadata = fixtures / 'multilingual' / 'cldf-metadata.json'
    return Dataset.from_metadata(metadata)
def get_dataset(args):
    """Load `args.dataset` as metadata description (.json) or metadata-free data file."""
    loader = (Dataset.from_metadata
              if args.dataset.suffix == '.json' else Dataset.from_data)
    return loader(args.dataset)
def main(args):
    """Populate the TuLaR clld database from the TuLaR CLDF datasets."""
    # Pick the newest OAI record per repository for each known dataset.
    for (org, repos), recs in itertools.groupby(
            sorted(oai.Records('tular'),
                   key=lambda r: (r.repos.org, r.repos.repos, r.version),
                   reverse=True),
            lambda r: (r.repos.org, r.repos.repos),
    ):
        if org == 'tupian-language-resources' and repos in DATASETS:
            # recs is sorted newest-first, so next() yields the latest record.
            DATASETS[repos] = next(recs)

    data = Data()
    dataset = data.add(
        common.Dataset,
        'tular',
        id=tular.__name__,
        domain="tular.clld.org",
        name="TuLaR",
        description="Tupían Language Resources",
        publisher_name="Max-Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        contact="*****@*****.**",
        jsondata={
            'license_icon': 'cc-by-sa.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 4.0 International License'
        },
    )

    # Interactively resolve the project directory (defaults to the repo root).
    rd = pathlib.Path(tular.__file__).parent.parent.parent.resolve()
    root = input('Project dir [{}]: '.format(str(rd)))
    root = pathlib.Path(root) if root else rd
    clts = clts_from_input(rd / '..' / 'cldf-clts' / 'clts-data')

    # One Database contribution per dataset, with contributors parsed from
    # the dataset's CONTRIBUTORS.md table.
    for db, rec in DATASETS.items():
        print(db, rec.doi, rec.tag)
        dbdir = root.joinpath(db)
        assert dbdir.exists()
        md = jsonlib.load(dbdir / 'metadata.json')
        name = md['title']
        if md['description']:
            name += ': {}'.format(md['description'])
        contribution = data.add(
            Database,
            db,
            id=db,
            name=name,
            description=rec.citation if rec else None,
            doi=rec.doi if rec else None,
        )
        header, contribs = next(
            iter_markdown_tables(
                dbdir.joinpath('CONTRIBUTORS.md').read_text(encoding='utf8')))
        for i, contrib in enumerate(contribs):
            contrib = dict(zip(header, contrib))
            # Contributors are deduplicated across datasets by last-name slug.
            cid = slug(HumanName(contrib['Name']).last)
            contributor = data['Contributor'].get(cid)
            if not contributor:
                contributor = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=contrib['Name'],
                    description=contrib.get('Affiliation'),
                )
            DBSession.add(
                common.ContributionContributor(
                    contribution=contribution,
                    contributor=contributor,
                    primary='author' in contrib['Role'].lower(),
                    ord=i,
                ))
    # Fixed list of dataset editors, in display order.
    for i, cid in enumerate(
            ['gerardi', 'reichert', 'aragon', 'list', 'forkel']):
        DBSession.add(
            common.Editor(contributor=data['Contributor'][cid],
                          dataset=dataset,
                          ord=i))

    source_ids = list(add_sources(args.cldf.bibpath, DBSession))
    sources = {s.id: s.pk for s in DBSession.query(common.Source)}
    # NOTE(review): `subgroups` is collected below but not used afterwards in
    # this function - possibly dead code; verify before removing.
    subgroups = []
    for row in args.cldf['LanguageTable']:
        if row['SubGroup'] not in subgroups:
            subgroups.append(row['SubGroup'])
        family = data['Family'].get(row['Family'])
        if (not family) and row['Family']:
            family = data.add(Family,
                              row['Family'],
                              id=slug(row['Family']),
                              name=row['Family'])
        data.add(
            Doculect,
            row['ID'],
            id=row['ID'],
            name=row['Name'].replace('_', ' '),
            family=family,
            subfamily=row['SubGroup'],
            iso_code=row['ISO639P3code'],
            glotto_code=row['Glottocode'],
            longitude=row['Longitude'],
            latitude=row['Latitude'],
            jsondata=dict(icon=SUBGROUPS[row['SubGroup']]),
        )

    # Import example sentences from the tudet CLDF dataset, skipping
    # duplicate IDs.
    tudet = Dataset.from_metadata(root / 'tudet' / 'cldf' /
                                  'Generic-metadata.json')
    seen = set()
    for row in tudet['ExampleTable']:
        if row['ID'] in seen:
            print('skipping duplicate sentence ID {}'.format(row['ID']))
            continue
        seen.add(row['ID'])
        DBSession.add(
            Example(id=row['ID'],
                    name=row['Primary_Text'],
                    description=row['Translated_Text'],
                    language=data['Doculect'][row['Language_ID']],
                    conllu=row['conllu']))

    # Lexical data (concepts, forms, cognates) is attributed to 'tuled'.
    contrib = data['Database']['tuled']
    for row in args.cldf['ParameterTable']:
        data.add(
            Concept,
            row['ID'],
            id=row['ID'].split('_')[0],
            name=row['Name'],
            portuguese=row['Portuguese_Gloss'],
            semantic_field=row['Semantic_Field'],
            concepticon_class=row['Concepticon_ID'],
            eol=row['EOL_ID'],
        )
    # One ValueSet per (language, concept) pair; groupby requires the rows to
    # be sorted by the same key.
    for (lid, pid), rows in itertools.groupby(
            sorted(args.cldf.iter_rows('FormTable', 'languageReference',
                                       'parameterReference'),
                   key=lambda r: (r['Language_ID'], r['Parameter_ID'])),
            lambda r: (r['Language_ID'], r['Parameter_ID']),
    ):
        vsid = '{}-{}'.format(lid, pid)
        vs = data.add(
            common.ValueSet,
            vsid,
            id=vsid,
            language=data['Doculect'][lid],
            parameter=data['Concept'][pid],
            contribution=contrib,
        )
        refs = set()
        for row in rows:
            data.add(
                Word,
                row['ID'],
                id=row['ID'],
                valueset=vs,
                name=row['Form'],
                tokens=' '.join(row['Segments']),
                simple_cognate=int(row['SimpleCognate']),
                notes=row['Comment'],
                morphemes=' '.join(row['Morphemes']),
                partial_cognate=' '.join([k for k in row['PartialCognates']])
                if row['PartialCognates'] else None,
            )
            refs = refs.union(row['Source'])
        # Only link references that were actually added as sources.
        for ref in refs:
            if ref in source_ids:
                DBSession.add(
                    common.ValueSetReference(valueset=vs,
                                             source_pk=sources[slug(
                                                 ref, lowercase=False)]))

    load_inventories(args.cldf, clts, data['Doculect'])

    # Cognate sets are created lazily on first sight of their ID.
    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(
                Cognateset,
                row['Cognateset_ID'],
                id=row['Cognateset_ID'],
                name=row['Cognateset_ID'],
                contribution=contrib,
            )
        data.add(
            Cognate,
            row['ID'],
            cognateset=cc,
            counterpart=data['Word'][row['Form_ID']],
            alignment=' '.join(row['Alignment'] or []),
        )