def update(repos, verbose=True):
    ecoregions = [
        (er['properties']['eco_code'], shape(er['geometry']))
        for er in jsonlib.load(data_file('ecoregions.json', repos=repos))['features']
        if er['geometry'] and er['properties']['eco_code'] not in INVALID_ECO_CODES]
    with CsvData('distribution', repos=repos) as data:
        res = {i.id: i for i in data.items}
        occurrence_data = list(data_file('external', 'gbif', repos=repos).glob('*.json'))
        if verbose:  # pragma: no cover
            occurrence_data = tqdm(occurrence_data)
        for fname in occurrence_data:
            sid = fname.stem
            d = res.get(sid, Distribution(sid, '', ''))
            if not d.countries__ids or not d.ecoregions__ids:
                occurrences = jsonlib.load(fname).get('results', [])
                if not d.ecoregions__ids:
                    d.ecoregions__ids = list(match(occurrences, ecoregions))
                if not d.countries__ids:
                    d.countries__ids = list(r.get('countryCode') for r in occurrences)
                res[sid] = d
        data.items = [res[key] for key in sorted(res.keys())]
def get_bib(args):
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    fname_to_cdstar = {}
    for type_ in ['texts', 'docs', 'data']:
        for hash_, paths in load(args.data_file('repos', type_ + '.json')).items():
            if hash_ in uploaded:
                for path in paths:
                    fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    for hash_, paths in load(args.data_file('repos', 'edmond.json')).items():
        if hash_ in uploaded:
            for path in paths:
                fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    db = Database.from_file(args.data_file('repos', 'Dogon.bib'), lowercase=True)
    for rec in db:
        doc = Document(rec)
        newurls = []
        for url in rec.get('url', '').split(';'):
            if not url.strip():
                continue
            if url.endswith('sequence=1'):
                newurls.append(url)
                continue
            url = URL(url.strip())
            if url.host() in ['dogonlanguages.org', 'github.com', '']:
                fname = url.path().split('/')[-1]
                doc.files.append((fname, fname_to_cdstar[fname]))
            else:
                newurls.append(url.as_string())
        doc.rec['url'] = '; '.join(newurls)
        yield doc
def issues(self):
    issues = jsonlib.load(self.issues_path)
    comments = jsonlib.load(self.comments_path)
    return [
        Issue(issue, comments.get(str(issue['number']), []))
        for issue in issues]
def village_images(args):
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    files = load(
        args.data_file('repos', 'Mali_villages_with_coordinates_for_website.json'))
    for hash_, paths in files.items():
        if hash_ in uploaded:
            fname = Path(paths[0])
            name, coords, desc, date_, creators = image_md(fname.stem)
            yield VillageImage(
                hash_,
                fname.name.decode('utf8'),
                VFN.get(fname.name),
                desc,
                date_,
                creators,
                coords,
                uploaded[hash_])
def ff_images(args):
    tsammalex = {
        i.id: i.taxa__id
        for i in reader(args.data_file('repos', 'tsammalex_images.csv'), namedtuples=True)}
    ref_pattern = re.compile('(?P<ref>[0-9]{5})')
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    files = load(args.data_file('repos', 'Heath_flora_fauna_images.json'))
    files.update(load(args.data_file('repos', 'ffmissing.json')))
    path_to_md5 = {}
    for md5, paths in files.items():
        for path in paths:
            path_to_md5[Path(path.encode('utf8')).stem] = md5
    missed, found, uploaded_ = 0, 0, 0
    for i, img in enumerate(reader(
            args.data_file('repos', 'dogon_flora-fauna.csv'),
            delimiter=',',
            namedtuples=True)):
        stem = Path(img.filenames.encode('utf8')).stem
        assert stem in path_to_md5
        found += 1
        if path_to_md5[stem] in uploaded:
            m = ref_pattern.search(stem)
            uploaded_ += 1
            yield FFImage(
                path_to_md5[stem],
                Path(files[path_to_md5[stem]][0].encode('utf8')).name,
                None,
                m.group('ref') if m else None,
                None,
                [],
                uploaded[path_to_md5[stem]],
                tsammalex.get(path_to_md5[stem]))
    videos = load(args.data_file('repos', 'videos_from_website.json'))
    videos.update(load(args.data_file('repos', 'videos.json')))
    for md5, paths in videos.items():
        if md5 in uploaded:
            path = Path(paths[0].encode('utf8'))
            m = ref_pattern.search(path.stem)
            uploaded_ += 1
            yield FFImage(
                md5,
                path.name,
                None,
                m.group('ref') if m else None,
                None,
                [],
                uploaded[md5],
                tsammalex.get(md5))
        else:
            missed += 1
    print('ff_images', missed, uploaded_)
def chapter(request):
    _html = get_html(ppath("Atlas", "%s.html" % request.matchdict["id"]))
    return {
        "md": jsonlib.load(ppath("Atlas", "%s.json" % request.matchdict["id"])),
        "html": lambda vt: _html.replace("<p>value-table</p>", vt),
        "ctx": Feature.get(request.matchdict["id"]),
    }
def __init__(self, path, repos=REPOS, container_cls=dict, json_opts=None):
    DataManager.__init__(self, path, repos)
    if self.path.exists():
        self.items = jsonlib.load(self.path, object_pairs_hook=OrderedDict)
    else:
        self.items = container_cls()
    self._json_opts = json_opts or {}
def __init__(self, path, cdstar_url=None, cdstar_user=None, cdstar_pwd=None):
    self.path = pathlib.Path(path)
    self.objects = {}
    if self.path.exists():
        if self.path.suffix.lower() == '.zip':
            with zipfile.ZipFile(str(self.path), 'r') as z:
                for filename in z.namelist():
                    with z.open(filename) as f:
                        self.objects = {
                            i: Object.fromdict(i, d)
                            for i, d in json.loads(f.read().decode('utf-8')).items()}
                    break
        else:
            self.objects = {
                i: Object.fromdict(i, d) for i, d in load(self.path).items()}
    self.api = Cdstar(service_url=cdstar_url, user=cdstar_user, password=cdstar_pwd)
def test_dataset_from_file(self):
    from pycldf.dataset import Dataset

    ds = Dataset.from_file(FIXTURES.joinpath('ds1.csv'))
    self.assertIn('ds1', repr(ds))
    self.assertEqual(len(ds), 2)
    self.assertEqual(ds.table.url, 'ds1.csv')
    self.assertEqual(ds.metadata['dc:creator'], 'The Author')
    row = ['3', 'abcd1234', 'fid2', 'maybe', '', 'new[4]']
    with self.assertRaises(ValueError):
        ds.add_row(row)
    ds.sources.add('@book{new,\nauthor={new author}}')
    res = ds.add_row(row)
    self.assertEqual(res.url, 'http://example.org/valuesets/3')
    self.assertEqual(len(res.refs), 1)
    self.assertEqual(
        res.valueUrl('Language_ID'),
        'http://glottolog.org/resource/languoid/id/abcd1234')
    res = ds.add_row(['4', None, None, None, None, None])
    self.assertEqual(res.valueUrl('Language_ID'), None)
    out = self.tmp_path()
    ds.write(out, '.tsv')
    self.assertTrue(out.joinpath('ds1.bib').exists())
    md = load(out.joinpath('ds1.tsv-metadata.json'))
    self.assertEqual('ds1.tsv', md['tables'][0]['url'])
    Dataset.from_file(out.joinpath('ds1.tsv'))
def test_get_subset(self):
    self.lex.get_subset([])
    self.assertEquals([v for v in self.lex.subsets.values() if v], [])
    pairs = jsonlib.load(test_data('KSL.pairs.json'))
    self.assertEquals(
        sorted('---'.join(k) for k in self.lex.subsets.keys()),
        sorted(pairs.keys()))
def load(table, csv, engine):
    schema = jsonlib.load(
        csv.parent.joinpath(csv.stem + '.' + CsvmJsonAdapter.extension))
    converter = get_converter(schema['tableSchema'], table)
    engine.execute(
        table.insert(), [converted(d, converter) for d in reader(csv, dicts=True)])
    return schema.get("dc:identifier")
def contribute(req):
    return {
        'missing': load(
            Path(asjp.__file__).parent.joinpath(
                'static', 'ethnologue17_diff.json'))['missing']
    }
def __init__(self, path):
    self.dir = path
    self.id = path.name
    self.cdstar = load(REPOS.joinpath('cdstar.json'))
    print(self.dir)
    assert self.dir.exists()
    desc = self.dir.joinpath('md.html')
    if desc.exists():
        with desc.open(encoding='utf8') as fp:
            self.description = fp.read()
    else:
        self.description = None
    md = self.dir.joinpath('md.json')
    self.md = load(md) if md.exists() else None
    self.props = self.md.get('properties', {}) if self.md else {}
def iter_languages():
    ldstatus = load(
        GLOTTOLOG_VENV.joinpath('glottolog3/glottolog3/static/ldstatus.json'))
    for l in Glottolog(GLOTTOLOG_VENV.joinpath('glottolog')).languoids():
        if l.level == Level.language and not l.category.startswith('Pseudo'):
            yield Language(l, ((ldstatus.get(l.id) or [[0, None]])[0] or [0, None])[1])
def add_component(self, component, *cols, **kw):
    if isinstance(component, str):
        component = jsonlib.load(
            pkg_path('components', '{0}{1}'.format(component, MD_SUFFIX)))
    if isinstance(component, dict):
        component = Table.fromvalue(component)
    assert isinstance(component, Table)
    if kw.get('url'):
        component.url = Link(kw['url'])
    for other_table in self.tables:
        if other_table.url == component.url:
            raise ValueError('tables must have distinct url properties')
    self.add_columns(component, *cols)
    try:
        table_type = self.get_tabletype(component)
    except ValueError:
        table_type = None
    if table_type:
        for other_table in self.tables:
            try:
                other_table_type = self.get_tabletype(other_table)
            except ValueError:  # pragma: no cover
                continue
            if other_table_type == table_type:
                raise ValueError('components must not be added twice')
    self.tables.append(component)
    component._parent = self.tablegroup
    self.auto_constraints(component)
    return component
def load(cls, path, contrib_md):
    # zenodo download dumps all files into a subfolder
    if not (path / 'cldf').exists():
        for subpath in path.glob('*'):
            if (subpath / 'cldf').exists():
                path = subpath
                break
    assert path.exists(), str(path)
    try:
        cldf_dataset = next(iter_datasets(path / 'cldf'))
    except StopIteration:
        raise ValueError('No cldf metadata file found in {}'.format(path))
    bib_path = path / 'cldf' / 'sources.bib'
    sources = bibtex.Database.from_file(bib_path) if bib_path.exists() else None
    md_path = path / 'metadata.json'
    md = jsonlib.load(md_path) if md_path.exists() else {}
    # XXX maybe also allow README.txt?
    readme_path = path / 'README.md'
    try:
        with readme_path.open(encoding='utf-8') as f:
            readme = f.read().strip()
    except IOError:
        readme = None
    authors = contrib_md.get('authors') or ()
    return cls(cldf_dataset, sources, authors, md.get('title'), readme)
def wals_detail_html(context=None, request=None, **kw):
    wals_data = Path(apics.__file__).parent.joinpath(
        'static', 'wals', '%sA.json' % context.parameter.wals_id)
    if not wals_data.exists():
        raise HTTPNotFound()
    wals_data = jsonlib.load(wals_data)
    value_map = {}
    for layer in wals_data['layers']:
        for feature in layer['features']:
            feature['properties']['icon'] = request.registry.getUtility(
                IIcon, name=feature['properties']['icon']).url(request)
            feature['properties']['popup'] = external_link(
                'http://wals.info/languoid/lect/wals_code_'
                + feature['properties']['language']['id'],
                label=feature['properties']['language']['name'])
        value_map[layer['properties']['number']] = {
            'icon': layer['features'][0]['properties']['icon'],
            'name': layer['properties']['name'],
            'number': layer['properties']['number'],
        }
    return {
        'wals_data': wals_data,
        'wals_map': WalsMap(
            context.parameter, request, data=wals_data, value_map=value_map),
        'apics_map': ApicsWalsMap(
            context.parameter, request, data=wals_data, value_map=value_map)}
def register(args):  # pragma: no cover
    """Register a dataset with datahub.io."""
    dataset = Dataset.first()
    name = 'clld-' + dataset.id.lower()
    package = datahub('package_show', id=name)
    if not package:
        package = datahub(
            'package_create',
            **{'name': name, 'title': 'CLLD-' + dataset.id.upper(), 'owner_org': 'clld'})
    md = {
        'url': 'http://%s' % dataset.domain,
        'notes': '%s published by the CLLD project' % dataset.name,
        'maintainer': 'CLLD Project',
        'tags': [
            {'name': 'linguistics'},
            {'name': 'lod'},
            {'name': 'llod'},
        ]}
    if dataset.contact:
        md['maintainer_email'] = dataset.contact
    if dataset.license:
        if 'creativecommons.org/licenses/by/' in dataset.license:
            md['license_id'] = 'cc-by-sa'
            md['license_title'] = "Creative Commons Attribution Share-Alike"
        elif 'creativecommons.org/' in dataset.license and '-nc' in dataset.license:
            md['license_id'] = 'cc-nc'
            md['license_title'] = "Creative Commons Non-Commercial (Any)"
    rdf_md = args.data_file('rdf-metadata.json')
    if rdf_md.exists():
        rdf_md = jsonlib.load(rdf_md)
        md['extras'] = [
            {'key': k, 'value': str(rdf_md[k])} for k in rdf_md.keys()
            if k.split(':')[0] in ['triples', 'resources', 'links']]
    package = datahub('package_update', id=name, **md)
    resources = [rsc['name'] for rsc in package['resources']]
    if 'VoID description' not in resources:
        rsc = datahub(
            'resource_create',
            package_id=package['id'],
            name='VoID description',
            url='http://%s/void.ttl' % dataset.domain,
            format='meta/void',
            mimetype='text/turtle')
        assert rsc
    rdf_dump = '%s-dataset.n3.gz' % dataset.id
    if ('RDF dump' not in resources) \
            and args.module_dir.joinpath('static', 'download', rdf_dump).exists():
        rsc = datahub(
            'resource_create',
            package_id=package['id'],
            name='RDF dump',
            url='http://%s/static/download/%s' % (dataset.domain, rdf_dump),
            format='text/n3',
            mimetype='text/n3')
        assert rsc
    print('>>> Make sure to upload the RDF dump to the production site.')
def write_languoids_table(self, outdir, version=None):
    version = version or self.describe()
    if outdir is not None and not outdir.exists():
        raise IOError(
            "Specified output directory %s does not exist. Please create it." % outdir)
    out = outdir / 'glottolog-languoids-{0}.csv'.format(version)
    md = outdir / (out.name + '-metadata.json')
    tg = TableGroup.fromvalue({
        "@context": "http://www.w3.org/ns/csvw",
        "dc:version": version,
        "dc:": "Harald Hammarström, Robert Forkel & Martin Haspelmath. "
               "clld/glottolog: Glottolog database (Version {0}) [Data set]. "
               "Zenodo. http://doi.org/10.5281/zenodo.596479".format(version),
        "tables": [
            load(pycldf.util.pkg_path('components', 'LanguageTable-metadata.json'))],
    })
    tg.tables[0].url = out.name
    for col in [
        dict(name='LL_Code'),
        dict(name='Classification', separator='/'),
        dict(name='Family_Glottocode'),
        dict(name='Family_Name'),
        dict(name='Language_Glottocode'),
        dict(name='Language_Name'),
        dict(name='Level', datatype=dict(base='string', format='family|language|dialect')),
        dict(name='Status'),
    ]:
        tg.tables[0].tableSchema.columns.append(Column.fromvalue(col))
    langs = []
    for lang in self.languoids():
        lid, lname = None, None
        if lang.level == self.languoid_levels.language:
            lid, lname = lang.id, lang.name
        elif lang.level == self.languoid_levels.dialect:
            for lname, lid, level in reversed(lang.lineage):
                if level == self.languoid_levels.language:
                    break
            else:  # pragma: no cover
                raise ValueError
        langs.append(dict(
            ID=lang.id,
            Name=lang.name,
            Macroarea=lang.macroareas[0].name if lang.macroareas else None,
            Latitude=lang.latitude,
            Longitude=lang.longitude,
            Glottocode=lang.id,
            ISO639P3code=lang.iso,
            LL_Code=lang.identifier.get('multitree'),
            Classification=[c[1] for c in lang.lineage],
            Language_Glottocode=lid,
            Language_Name=lname,
            Family_Name=lang.lineage[0][0] if lang.lineage else None,
            Family_Glottocode=lang.lineage[0][1] if lang.lineage else None,
            Level=lang.level.name,
            Status=lang.endangerment.status.name if lang.endangerment else None,
        ))
    tg.to_file(md)
    tg.tables[0].write(langs, fname=out)
    return md, out
def test_json(self):
    from clldutils.jsonlib import dump, load

    d = {'a': 234, 'ä': 'öäüß'}
    p = self.tmp_path('test')
    dump(d, p)
    for k, v in load(p).items():
        assert d[k] == v
def jsondump(obj, fname, log=None):
    fname = Path(fname)
    if fname.exists():
        d = jsonlib.load(fname)
        d.update(obj)
        obj = d
    jsonlib.dump(sorted_obj(obj), fname, indent=4)
    log_dump(fname, log=log)
    return obj
def get_concept(s):
    global _concepticon
    if _concepticon is None:
        _concepticon = load(Path(dictionaria.__file__).parent.joinpath(
            'static', 'concepticon-1.0-labels.json'))
    s = s.lower()
    if s in _concepticon['conceptset_labels']:
        return _concepticon['conceptset_labels'][s]
    return _concepticon['alternative_labels'].get(s)
def test_read_editors(api_copy):
    prepare_release(api_copy, '3.3')
    zenodo = load(api_copy.path('.zenodo.json'))
    assert zenodo['creators'][1]['affiliation'] == 'University Uppsala'
    assert zenodo['description'] == \
        '<p>, C & Hammarström, Harald & Forkel, Robert. ' \
        '1999. Glottolog 3.3. ' \
        'Jena: Max Planck Institute for the Science of Human History. ' \
        '(Available online at ' \
        '<a href="https://glottolog.org">https://glottolog.org</a>)</p>'
def __init__(self, path):
    self.dir = path
    self.id = path.name
    self.cdstar = load(REPOS.joinpath('cdstar.json'))
    print(self.dir)
    assert self.dir.exists()
    desc = self.dir.joinpath('intro.md')
    if desc.exists():
        with desc.open(encoding='utf8') as fp:
            self.description = fp.read()
    else:
        self.description = None
    md = self.dir.joinpath('md.json')
    self.md = load(md) if md.exists() else None
    self.props = self.md.get('properties', {}) if self.md else {}
    bib = self.dir.joinpath('sources.bib')
    self.bib = bibtex.Database.from_file(bib) if bib.exists() else None
def cmd_makecldf(self, args):
    concepts = args.writer.add_concepts(
        id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
        lookup_factory="Database_ID",
    )
    language_map = {
        lang["ID"]: lang["Glottocode"] or None for lang in self.languages}
    sources = {}
    for path in sorted(self.raw_dir.glob("*.json"), key=lambda _p: int(_p.stem)):
        data = jsonlib.load(path)
        iso = data.get("ISO 639-3")
        if iso:
            iso = iso.strip()
        args.writer.add_language(
            ID=data["id"],
            Name=data["name"],
            ISO639P3code=iso if iso not in {"no", "XXX"} else None,
            Glottocode=language_map[data["id"]],
        )
        for table in ["basic", "flora", "cult"]:
            if table not in data["tables"]:
                continue
            for item in data["tables"][table]["rows"]:
                item = dict(zip(data["tables"][table]["header"], item))
                form = item["Orthographic Form"].strip()
                if form:
                    refs = [ref for ref in itersources(item, data, sources) if ref]
                    args.writer.add_sources(*[ref.source for ref in refs])
                    href, _ = item["English"]
                    concept_database_id = href.split("/")[-1]
                    if not concepts.get(concept_database_id):
                        # https://huntergatherer.la.utexas.edu/lexical/feature/729
                        # is missing from the concept list(s)
                        continue
                    args.writer.add_lexemes(
                        Language_ID=data["id"],
                        Parameter_ID=concepts[concept_database_id],
                        Value=form,
                        Loan=bool(item["Loan Source"] or item["Wanderwort Status"]),
                        Phonemic=item["Phonemicized Form"] or None,
                        Source=["%s" % ref for ref in refs],
                        Creator=item.get("Created By"),
                        Comment=item.get("General Notes"),
                    )
def test_SourcesCatalog(tmp_path):
    cat_path = tmp_path / 'test.json'
    with SourcesCatalog(cat_path) as cat:
        cat.add(
            'key',
            Object('id', [Bitstream('bsid', 5, 'text/plain', '', '', '')], {}))
        assert 'key' in cat
        assert 'url' in cat.get('key')
    assert 'key' in load(str(cat_path))
def rename(args):  # pragma: no cover
    api = Concepticon(args.repos)
    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)
    cl = api.conceptlists[from_]

    # write the adapted concept list to the new path:
    with UnicodeWriter(
            cl.path.parent / cl.path.name.replace(from_, to_), delimiter='\t') as writer:
        header = []
        for i, row in enumerate(reader(cl.path, delimiter='\t')):
            if i == 0:
                header = row
                writer.writerow(row)
                header = {v: k for k, v in enumerate(header)}  # Map col name to row index
            else:
                oid = row[header['ID']]
                assert oid.startswith(from_)
                nid = oid.replace(from_, to_)
                api.add_retirement(
                    'Concept', dict(id=oid, comment='renaming', replacement=nid))
                row[header['ID']] = nid
                writer.writerow(row)

    # write adapted metadata to the new path:
    fname = cl.path.name.replace(from_, to_) + MD_SUFFIX
    md = jsonlib.load(
        cl.path.parent / (cl.path.name + MD_SUFFIX), object_pairs_hook=OrderedDict)
    md['tables'][0]['url'] = fname
    jsonlib.dump(md, cl.path.parent / fname, indent=4)

    # remove obsolete concept list and metadata:
    cl.path.unlink()
    cl.path.parent.joinpath(cl.path.name + MD_SUFFIX).unlink()

    # adapt conceptlists.tsv
    rows = []
    for row in reader(api.data_path('conceptlists.tsv'), delimiter='\t'):
        rows.append([col.replace(from_, to_) if col else col for col in row])
    with UnicodeWriter(api.data_path('conceptlists.tsv'), delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement(
        'Conceptlist', dict(id=from_, comment='renaming', replacement=to_))

    print("""Please run
grep -r "{0}" concepticondata/ | grep -v retired.json
to confirm the renaming was complete!""".format(from_))
def test():
    if not REPOS.exists():
        return
    data = {
        n: OrderedDict([(item.id, item) for item in models.CsvData(n, on_error=error)])
        for n in CSV}
    data['ecoregions'] = {}
    for ecoregion in jsonlib.load(data_file('ecoregions.json'))['features']:
        data['ecoregions'][ecoregion['properties']['eco_code']] = ecoregion
    data['refs'] = {}
    with data_file('sources.bib').open(encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                data['refs'][match.group('id')] = 1
    data['countries'] = {country.alpha2: country for country in countries}
    for name in ['names', 'taxa']:
        for line, item in enumerate(data[name].values()):
            for ref in item.refs__ids:
                if '[' in ref:
                    source_id, pages = ref.split('[', 1)
                    if not pages.endswith(']'):  # pragma: no cover
                        error('invalid reference %s' % (ref,), name, line + 2)
                else:
                    source_id = ref
                if source_id not in data['refs']:  # pragma: no cover
                    error('invalid id referenced: %s' % (source_id,), name, line + 2)
    for name, model in [(n, getattr(models, n.capitalize())) for n in CSV]:
        for line, item in enumerate(data[name].values()):
            for col in [f.name for f in attr.fields(model)]:
                if '__' in col:
                    ref, cardinality = col.split('__', 1)
                    #if ref not in data:
                    #    continue
                    ids = getattr(item, col)
                    if cardinality == 'id':
                        assert not isinstance(ids, list)
                        ids = [ids]
                    for v in ids:
                        if ref not in data:
                            raise ValueError(ref)  # pragma: no cover
                        if ref == 'refs' and '[' in v:
                            v = v.split('[')[0]
                        if v not in data[ref]:  # pragma: no cover
                            error('invalid %s id referenced: %s' % (ref, v), name, line + 2)
    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')
def vocabularies(self):
    """
    Provide access to a `dict` of controlled vocabularies.
    """
    res = jsonlib.load(self.data_path('concepticon.json'))
    for k in res['COLUMN_TYPES']:
        v = res['COLUMN_TYPES'][k]
        if isinstance(v, list) and v and v[0] == 'languoid':
            res['COLUMN_TYPES'][k] = Languoid(v[1])
    return res
def _metadata(self, id_):
    values_path = self.data_path('concept_set_meta', id_ + '.tsv')
    md_path = self.data_path('concept_set_meta', id_ + '.tsv' + MD_SUFFIX)
    assert values_path.exists() and md_path.exists()
    md = jsonlib.load(md_path)
    return Metadata(
        id=id_,
        meta=md,
        values=to_dict(
            read_dicts(values_path, schema=md['tableSchema']),
            key=operator.itemgetter('CONCEPTICON_ID')))
def __init__(self, dataset, fname):
    self.dataset = dataset
    self.fname = fname
    if fname.exists():
        try:
            self.report = jsonlib.load(fname)
        except ValueError:
            self.report = {}
    else:
        self.report = {}
def experiments(self):
    gbif = load(self.path('gbif.json'))
    res = [
        Experiment.from_dict(d, self.sources)
        for d in list(dsv.reader(self.path('data.Sheet1.csv'), dicts=True))[1:]
    ]
    for ex in res:
        key, md = gbif.get(ex.species_latin, (None, None))
        if key:
            ex.gbif = GBIF(key=key, metadata=md)
    return res
def cmd_readme(self, args):
    res = self.metadata.markdown()
    tr = self.cldf_dir / '.transcription-report.json'
    tr = jsonlib.load(tr) if tr.exists() else None
    res += report.report(self, tr, getattr(args, 'glottolog', None), args.log)
    if self.contributors_path.exists():
        res += '\n\n{0}\n\n'.format(self.contributors_path.read_text(encoding='utf8'))
    self.dir.write('FORMS.md', self.form_spec.as_markdown(self))
    return res
def test_makecldf(repos, dataset, dataset_cldf, dataset_no_cognates, sndcmp, capsys, tmp_path):
    _main('lexibank.makecldf {0} --glottolog {1} --concepticon {1} --clts {1}'.format(
        str(dataset.dir / 'td.py'),
        str(repos),
    ))
    assert 'Papunesia' in dataset.cldf_dir.joinpath('languages.csv').read_text(encoding='utf8')
    # Metadata for Zenodo is merged if this makes sense:
    assert len(jsonlib.load(dataset.dir / '.zenodo.json')['communities']) == 3

    _main('lexibank.makecldf {0} --dev --glottolog {1} --concepticon {1} --clts {1}'.format(
        str(dataset.dir / 'td.py'),
        str(repos),
    ))
    assert 'Papunesia' not in dataset.cldf_dir.joinpath('languages.csv').read_text(
        encoding='utf8')
    assert '### Replacement' in dataset.dir.joinpath('FORMS.md').read_text(encoding='utf8')

    _main('lexibank.makecldf {0} --glottolog {1} --concepticon {1} --clts {1}'.format(
        str(sndcmp.dir / 'ts.py'),
        str(repos),
    ))
    assert 'Bislama_Gloss' in sndcmp.cldf_dir.joinpath('parameters.csv').read_text(
        encoding='utf8')
    assert 'e56a5fc78ae5a66e783c17bc30019568' in sndcmp.cldf_dir.joinpath(
        'media.csv').read_text(encoding='utf8')

    _main('lexibank.makecldf {0} --glottolog {1} --concepticon {1} --clts {1}'.format(
        str(dataset_cldf.dir / 'tdc.py'),
        str(repos),
    ))
    capout = capsys.readouterr().out
    assert 'The dataset has no sources' not in capout

    _main('lexibank.makecldf {0} --glottolog {1} --concepticon {1} --clts {1}'.format(
        str(dataset_no_cognates.dir / 'tdn.py'),
        str(repos),
    ))
    assert not dataset_no_cognates.cldf_dir.joinpath('cognates.csv').exists()

    _main('lexibank.load --db {3} {0} --glottolog {1} --concepticon {2}'.format(
        str(dataset_no_cognates.dir / 'tdn.py'),
        str(repos),
        str(repos),
        str(tmp_path / 'db'),
    ))
def content_callback(request, context):
    if 'access' in request.url:  # file download
        with zipfile.ZipFile(tmp_path / 'ds.zip', 'w') as zip:
            zip.write(
                pathlib.Path(__file__).parent / 'repos' / 'csv' /
                '2022-06-1KRR1P_ZIMBABWE_CRATON_ARCHEAN.csv',
                '2022-06-1KRR1P_ZIMBABWE_CRATON_ARCHEAN.csv')
        return tmp_path.joinpath('ds.zip').read_bytes()
    # Dataset metadata:
    return json.dumps(dict(data=load(repos / 'datasets.json')[0])).encode('utf8')
def update(repos, log):
    ecoregions = [
        (er['properties']['eco_code'], shape(er['geometry']))
        for er in jsonlib.load(data_file('ecoregions.json', repos=repos))['features']
        if er['geometry'] and er['properties']['eco_code'] not in INVALID_ECO_CODES]
    with CsvData('distribution', repos=repos) as data:
        res = {i.id: i for i in data.items}
        occurrence_data = list(data_file('external', 'gbif', repos=repos).glob('*.json'))
        for fname in tqdm(occurrence_data):
            sid = fname.stem
            d = res.get(sid, Distribution(sid, '', ''))
            if not d.countries__ids or not d.ecoregions__ids:
                occurrences = jsonlib.load(fname).get('results', [])
                if not d.ecoregions__ids:
                    d.ecoregions__ids = list(match(occurrences, ecoregions, log))
                if not d.countries__ids:
                    d.countries__ids = list(r.get('countryCode') for r in occurrences)
                res[sid] = d
        data.items = [res[key] for key in sorted(res.keys())]
def load_whitelist():
    """
    Basic function to load the CLPA whitelist.
    """
    _clpadata = jsonlib.load(local_path('clpa.main.json'))
    whitelist = {}
    for group in ['consonants', 'vowels', 'markers', 'tones', 'diphtongs']:
        for val in _clpadata[group]:
            whitelist[_clpadata[val]['glyph']] = _clpadata[val]
            whitelist[_clpadata[val]['glyph']]["ID"] = val
    return whitelist
def iter_files(args):
    files = defaultdict(list)
    for n in """
            Burkina_flora_for_website.json
            data.json
            docs.json
            edmond.json
            ffmissing.json
            Heath_flora_fauna_images.json
            Mali_villages_with_coordinates_for_website.json
            texts.json
            videos_from_website.json
            videos.json
            """.split():
        files.update(load(args.data_file('repos', n)))
    missing, matched = 0, 0
    for md5, cdstar in load(args.data_file('repos', 'cdstar.json')).items():
        if md5 in files:
            fnames = [Path(p.encode('utf8')).name.decode('utf8') for p in files[md5]]
            fname = sorted(fnames, key=lambda n: len(n))[-1]
            fname = fname.replace(' ', '_')
            if fname == 'Thumbs.db':
                continue
            m = date_p.search(fname)
            if m:
                d = date(int(m.group('y')), int(m.group('m')), int(m.group('d') or 1))
            else:
                d = None
            yield File(
                md5,
                fname,
                guess_type(fname)[0].decode('utf8'),
                d,
                cdstar['size'],
                cdstar.get('duration'),
                cdstar)
            matched += 1
        else:
            missing += 1
    print('iter_files', missing, matched)
def get_text(what, id_, fmt):
    p = text_path(what, '{0}.{1}'.format(id_, fmt))
    if not p.exists():
        raise ValueError(p)
    if fmt == 'json':
        return jsonlib.load(p)
    text = read_text(p)
    if fmt == 'css':
        return text
    body = bs(text).find('body')
    body.name = 'div'
    body.attrs.clear()
    return '{0}'.format(body).replace('.popover(', '.clickover(')
def old_downloads():
    from clldmpg import cdstar

    def bitstream_link(oid, spec):
        url = cdstar.SERVICE_URL.path(
            '/bitstreams/{0}/{1}'.format(oid, spec['bitstreamid'])).as_string()
        return HTML.a(
            '{0} [{1}]'.format(spec['bitstreamid'], format_size(spec['filesize'])),
            href=url)

    for number, spec in sorted(
            load(Path(__file__).parent.joinpath('static', 'downloads.json')).items()):
        yield number, [bitstream_link(spec['oid'], bs) for bs in spec['bitstreams']]
def get(dataset, resource, offset=0, limit=LIMIT, download_=False):
    fname = dataset.raw.joinpath("%(resource)s-%(limit)s-%(offset)s.json" % locals())
    if fname.exists() and not download_:
        return jsonlib.load(fname)
    if not download_:
        raise ValueError
    res = requests.get(
        "{0}/api/v1/{1}/".format(BASE_URL, resource),
        params=dict(
            format='json',
            limit='{0}'.format(limit),
            offset='{0}'.format(offset))).json()
    jsonlib.dump(res, fname)
    return res
def run():
    terms = []
    for e in read_terms().iter():
        if ns('rdf:about') in e.attrib:
            terms.append(e.attrib[ns('rdf:about')])
    for d in ['components', 'modules']:
        for f in walk(REPO_DIR.joinpath(d)):
            if f.suffix == '.json':
                md = load(f)
                for k, v in iterproperties(md):
                    if k in ['propertyUrl', 'dc:conformsTo'] and v not in terms:
                        print(f)
                        print(v)
def downloads(req):
    mod = importlib.import_module(req.registry.settings['clld.pkg'])
    dls = Path(mod.__file__).parent.joinpath('static', 'downloads.json')
    print(dls)

    def bitstream_link(oid, spec):
        url = SERVICE_URL.path(
            '{0}/{1}'.format(oid, spec['bitstreamid'])).as_string()
        return HTML.a(
            '{0} [{1}]'.format(spec['bitstreamid'], format_size(spec['filesize'])),
            href=url)

    dls = load(dls) if dls.exists() else {}
    for rel, spec in sorted(dls.items()):
        yield rel, [bitstream_link(spec['oid'], bs) for bs in spec['bitstreams']]
def fixtures(type_, name):
    res = {}
    for fname in fixture_path(type_).iterdir():
        name_, key = fname.stem.split('_')
        if name_ == name:
            value = fname
            if fname.suffix == '.json':
                value = jsonlib.load(fname)
            elif fname.suffix == '.html':
                with fname.open(encoding='utf8') as fp:
                    value = fp.read()
            elif fname.suffix == '.xml':
                with open(fname.as_posix(), 'rb') as fp:
                    value = fp.read()
            res[key] = value
    return res
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because it
    will have to be run periodically whenever data has been updated.
    """
    concepticon = {
        c.GLOSS: c.CONCEPTICON_ID
        for c in reader(
            args.data_file('repos', 'conceptlist.tsv'), delimiter='\t', namedtuples=True)
        if c.CONCEPTICON_ID}
    sdata = jsonlib.load(args.data_file('repos', 'classification.json'))
    for concept in DBSession.query(models.Concept).options(joinedload(common.Parameter._files)):
        for t_ in ['image', 'video']:
            setattr(concept, 'count_{0}s'.format(t_), len(getattr(concept, t_ + 's')))
        if concept.jsondata['ref'] in sdata:
            util.update_species_data(concept, sdata[concept.jsondata['ref']])
        if concept.name in concepticon:
            concept.concepticon_id = int(concepticon[concept.name])
def __init__(self, path_or_id):
    if isinstance(path_or_id, Path):
        self.dir = path_or_id
        self.id = path_or_id.name
    else:
        self.id = path_or_id
        self.dir = REPOS.joinpath('submissions', path_or_id)
    assert self.dir.exists()
    md = self.dir.joinpath('md.json')
    self.md = load(md) if md.exists() else None
    self.db_name = None
    self.type = None
    if self.dir.joinpath('db.sfm').exists():
        self.db_name = 'db.sfm'
        self.type = 'sfm'
    else:
        raise ValueError('no valid db file in %s' % self.dir)
def x(args):
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return
    fname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(fname)
    release = args.args[0]
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        obj = cat.api.get_object(uid=downloads[release]['oid'])
        bitstreams = obj.bitstreams[:]
        for bs in bitstreams:
            print(bs.id, bs._properties)
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')
    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)
        glottolog_repos = Path(
            lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'glottolog3', 'glottolog')
        languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
        concepticon = Concepticon(
            Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'concepticon', 'concepticon-data'))
        conceptsets = {c.id: c for c in concepticon.conceptsets.values()}
        skip = True
        for dname in sorted(repos.joinpath('datasets').iterdir(), key=lambda p: p.name):
            #if dname.name == 'benuecongo':
            #    skip = False
            #if skip:
            #    continue
            if dname.is_dir() and dname.name != '_template':
                mdpath = dname.joinpath('cldf', 'metadata.json')
                if mdpath.exists():
                    print(dname.name)
                    import_cldf(dname, load(mdpath), languoids, conceptsets)
    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexibank/lexibank-data')
    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="lexibank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)
        glottolog = Glottolog(
            Path(lexibank.__file__).parent.parent.parent.parent.joinpath(
                'glottolog3', 'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
        concepticon = Concepticon(
            Path(lexibank.__file__).parent.parent.parent.parent.joinpath(
                'concepticon', 'concepticon-data'))
        conceptsets = {c['ID']: c for c in concepticon.conceptsets()}
        for dname in repos.joinpath('datasets').iterdir():
            #if dname.name not in ['acbd']:
            #    continue
            if dname.is_dir() and dname.name != '_template':
                #if dname.name != 'zenodo34092':
                #    continue
                mdpath = dname.joinpath('metadata.json')
                if mdpath.exists():
                    print(dname.name)
                    import_cldf(dname, load(mdpath), languoids, conceptsets)
    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexibankLanguage),
            glottolog=languoids,
            isolates_icon='tcccccc')
def dl2cdstar(args):
    app = app_name(args.project)
    if not app:
        args.log.error('cannot parse package name')
        return
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return
    title_pattern = re.compile('%s (?P<version>[0-9.]+) - downloads' % re.escape(app))
    title = '{0} {1} - downloads'.format(app, args.version)
    pkg_dir = args.project.joinpath(app)
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        obj = cat.api.get_object()
        obj.metadata = {"creator": "pycdstar", "title": title}
        if args.args:
            obj.metadata["description"] = args.args[0]
        for fname in pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                print(fname.name)
                obj.add_bitstream(
                    fname=fname.as_posix(), name=fname.name.replace('-', '_'))
        cat.add(obj)
    fname = pkg_dir.joinpath('static', 'downloads.json')
    with update(fname, default={}, indent=4) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if match.group('version') not in downloads:
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(fname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))
def cached_metadata(self, sid, id=None, name=None, refresh=False):
    if data_file('external', self.name, repos=self.repos).is_dir():
        fname = data_file('external', self.name, sid + '.json', repos=self.repos)
        if not fname.exists() or refresh:
            try:
                data = self.metadata(id or self.identify(name))
            except:  # pragma: no cover
                data = None
            if not data:
                return  # pragma: no cover
            jsonlib.dump(data, fname)
            return data
        return jsonlib.load(fname)
    if sid not in self.items or refresh:
        try:
            self.items[sid] = self.metadata(id or self.identify(name))
        except:
            return
    return self.items[sid]
def survey(request):
    id_ = request.matchdict["id"]
    md = jsonlib.load(ppath("Surveys", "%s.json" % id_))
    html = get_html(ppath("Surveys", "%s.html" % id_))
    maps = []
    for fname in sorted(
            ppath("Surveys", processed="maps").glob(
                "%s*.png" % id_.split(".")[1].replace("-", "_")),
            key=lambda fn: fn.stem):
        img = b64encode(open(fname.as_posix(), "rb").read())
        if "figure" in fname.stem:
            html = html.replace("{%s}" % fname.stem, "data:image/png;base64,%s" % img)
        else:
            maps.append(img)
    return {
        "maps": maps,
        "md": md,
        "authors": [Contributor.get(a["id"]) for a in md["authors"]],
        "html": html,
        "ctx": ApicsContribution.get(id_.split(".")[0]),
    }
def test_read_write(self):
    from pycldf.csv import Reader, Writer

    table = load(FIXTURES.joinpath('ds1.csv-metadata.json'))['tables'][0]
    table['tableSchema']['columns'][0]['datatype'] = 'integer'
    table['url'] = 'test.tsv'
    row = '1,abcd1234,fid1,yes,,80086;meier2015[2-5]'.split(',')
    with Archive(self.tmp_path('test.zip'), 'w') as archive:
        with Writer(table, container=archive) as writer:
            writer.writerow(row)
    with Archive(self.tmp_path('test.zip')) as archive:
        with Reader(table, container=archive) as reader:
            rows = list(reader)
    self.assertEqual(rows[0]['ID'], 1)
    self.assertEqual(
        rows[0].valueUrl('Language_ID'),
        'http://glottolog.org/resource/languoid/id/abcd1234')
    self.assertEqual(rows[0].to_list(), row)

    table = Table(table)
    del table.dialect['header']
    self.assertTrue(table.dialect.header)
    del table.dialect['delimiter']
    self.assertEqual(table.dialect.delimiter, ',')
    table.dialect.header = False
    with Writer(table, container=self.tmp_path()) as writer:
        writer.writerow(row)
        writer.writerows(rows)
    with Reader(table, container=self.tmp_path()) as reader:
        rows = list(reader)
    self.assertEqual(rows[0]['ID'], 1)
    self.assertEqual(
        rows[0].valueUrl('Language_ID'),
        'http://glottolog.org/resource/languoid/id/abcd1234')
    self.assertEqual(rows[0].to_list(), row)
def datasets(args):
    """
    cldf datasets <DIR> [ATTRS]

    List all CLDF datasets in directory <DIR>
    """
    if len(args.args) < 1:
        raise ParserError('not enough arguments')
    d = Path(args.args[0])
    if not d.exists() or not d.is_dir():
        raise ParserError('%s is not an existing directory' % d)
    for fname in sorted(d.glob('*' + MD_SUFFIX), key=lambda p: p.name):
        md = Metadata(load(fname))
        data = fname.parent.joinpath(md.get_table().url or fname.name[:-len(MD_SUFFIX)])
        if data.exists():
            print(data)
            if len(args.args) > 1:
                maxlen = max(len(a) for a in args.args[1:])
                for attr in args.args[1:]:
                    if md.get(attr):
                        print(' %s %s' % ((attr + ':').ljust(maxlen + 1), md[attr]))