def common_props(self):
    res = {
        "dc:title": self.title,
        "dc:description": self.description,
        "dc:bibliographicCitation": self.citation,
        "dc:license": licenses.find(self.license or ''),
        "dc:identifier": self.url,
        "dc:format": [
            "http://concepticon.clld.org/contributions/{0}".format(cl)
            for cl in self.conceptlist],
        "dc:isVersionOf": "http://lexibank.clld.org/contributions/{0}".format(
            self.derived_from) if self.derived_from else None,
        "dc:related": self.related,
        "aboutUrl": self.aboutUrl,
    }
    if self.known_license:
        res['dc:license'] = self.known_license.url
    elif self.license:
        res['dc:license'] = self.license
    return res

def ls(args): """ gelato ls [COLS]+ column specification: - license - macroareas """ table = Table('ID', 'Title') cols = [col for col in args.args if col in ['license', 'macroareas']] tl = 40 if args.args: tl = 25 table.columns.extend(col.capitalize() for col in cols) for d in data_path(repos=Path(args.gelato_repos)).iterdir(): if is_dataset_dir(d): ds = Dataset(d) row = [d.name, ds.md['dc:title']] for col in cols: if col == 'license': lic = licenses.find(ds.md.get('dc:license') or '') row.append(lic.id if lic else ds.md.get('dc:license')) table.append(row) print( table.render(tablefmt='simple', sortkey=lambda r: r[0], condensed=False))
def bibtex(self):
    src = Source(
        'misc',
        self.doi.split('/')[-1].replace('.', '-'),
        author=' and '.join(self.creators),
        title=self.title,
        keywords=', '.join(self.keywords),
        publisher='Zenodo',
        year=self.year,
        doi=self.doi,
        url='https://doi.org/{}'.format(self.doi),
    )
    if self.license:
        lic = licenses.find(self.license)
        src['copyright'] = lic.name if lic else self.license
    return src.bibtex()

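# A minimal, self-contained sketch of the copyright lookup used in bibtex()
# above. Only clldutils.licenses.find() is involved; the sample specs are
# illustrative. find() resolves a known spec to a License object carrying a
# human-readable `name`, and the raw string serves as the fallback for
# anything it does not recognize.
from clldutils import licenses

for spec in ['CC-BY-4.0', 'some in-house license']:
    lic = licenses.find(spec)
    print(lic.name if lic else spec)
# prints something like 'Creative Commons Attribution 4.0 ...' for the first
# spec, and the raw string 'some in-house license' for the second.
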
def ls(args): """ lexibank ls [COLS]+ column specification: - license - lexemes - macroareas """ # FIXME: how to smartly choose columns? table = Table('ID', 'Title') cols = [ col for col in args.args if col in ['license', 'lexemes', 'macroareas'] ] tl = 40 if args.args: tl = 25 table.columns.extend(col.capitalize() for col in cols) for d in data_path(repos=Path(args.lexibank_repos)).iterdir(): if is_dataset_dir(d): ds = Dataset(d) row = [d.name, short_title(ds.md['dc:title'], l=tl)] for col in cols: if col == 'license': lic = licenses.find(ds.md.get('dc:license') or '') row.append(lic.id if lic else ds.md.get('dc:license')) elif col in ['lexemes', 'macroareas']: mds = list(ds.iter_cldf_metadata()) if col == 'lexemes': row.append( sum(md.notes['stats']['lexeme_count'] for md in mds)) elif col == 'macroareas': mas = set() for md in mds: mas = mas.union(md.notes['stats']['macroareas']) row.append(', '.join(sorted(mas))) table.append(row) print( table.render(tablefmt='simple', sortkey=lambda r: r[0], condensed=False))
def known_license(self):
    if self.license:
        return licenses.find(self.license)

def ls(args): """ lexibank ls [COLS]+ column specification: - license - lexemes - macroareas """ db = Database(args.db) db.create(exists_ok=True) in_db = { r[0]: r[1] for r in db.fetchall('select id, version from dataset') } # FIXME: how to smartly choose columns? table = Table('ID', 'Title') cols = OrderedDict([(col, {}) for col in args.args if col in [ 'version', 'location', 'changes', 'license', 'all_lexemes', 'lexemes', 'concepts', 'languages', 'families', 'varieties', 'macroareas', ]]) tl = 40 if cols: tl = 25 table.columns.extend(col.capitalize() for col in cols) for col, sql in [ ('languages', 'glottocodes_by_dataset'), ('concepts', 'conceptsets_by_dataset'), ('lexemes', 'mapped_lexemes_by_dataset'), ('all_lexemes', 'lexemes_by_dataset'), ('macroareas', 'macroareas_by_dataset'), ('families', 'families_by_dataset'), ]: if col in cols: cols[col] = {r[0]: r[1] for r in db.fetchall(sql)} for ds in args.cfg.datasets: row = [ colored(ds.id, 'green' if ds.id in in_db else 'red'), truncate_with_ellipsis(ds.metadata.title or '', width=tl), ] for col in cols: if col == 'version': row.append(git_hash(ds.dir)) elif col == 'location': row.append(colored(str(ds.dir), 'green')) elif col == 'changes': row.append(ds.git_repo.is_dirty()) elif col == 'license': lic = licenses.find(ds.metadata.license or '') row.append(lic.id if lic else ds.metadata.license) elif col in [ 'languages', 'concepts', 'lexemes', 'all_lexemes', 'families' ]: row.append(float(cols[col].get(ds.id, 0))) elif col == 'macroareas': row.append(', '.join( sorted((cols[col].get(ds.id) or '').split(',')))) else: row.append('') table.append(row) totals = ['zztotal', len(args.cfg.datasets)] for i, col in enumerate(cols): if col in ['lexemes', 'all_lexemes']: totals.append(sum([r[i + 2] for r in table])) elif col == 'languages': totals.append( float( db.fetchone( "SELECT count(distinct glottocode) FROM languagetable") [0])) elif col == 'concepts': totals.append( float( db.fetchone( "SELECT count(distinct concepticon_id) FROM parametertable" )[0])) elif col == 'families': totals.append( float( db.fetchone( "SELECT count(distinct family) FROM languagetable") [0])) else: totals.append('') table.append(totals) print( table.render(tablefmt='simple', sortkey=lambda r: r[0], condensed=False, floatfmt=',.0f'))
def test_find():
    assert find('http://creativecommons.org/licenses/by/4.0').id == 'CC-BY-4.0'
    assert find('CC-BY-4.0').url == 'https://creativecommons.org/licenses/by/4.0/'

def test_legalcode():
    assert find('cc-by-4.0').legalcode
    assert find('Zlib').legalcode is None

def test_find():
    from clldutils.licenses import find

    assert find('http://creativecommons.org/licenses/by/4.0').id == 'CC-BY-4.0'
    assert find('CC-BY-4.0').url == 'https://creativecommons.org/licenses/by/4.0/'

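# Taken together, the tests above pin down find()'s lookup semantics: a
# license URL, the canonical id, and a lowercased id all resolve to the same
# License object, which exposes `id`, `url`, `name` and `legalcode`. A
# self-contained recap (the None case for unmatched input is inferred from
# the "lic.id if lic else ..." guards used throughout the code above):
from clldutils.licenses import find

assert find('http://creativecommons.org/licenses/by/4.0').id == 'CC-BY-4.0'
assert find('CC-BY-4.0').url == 'https://creativecommons.org/licenses/by/4.0/'
assert find('cc-by-4.0').id == 'CC-BY-4.0'
assert find('no such license') is None
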
def main(args):  # pragma: no cover
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name,
        },
    )

    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        c = data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )

    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(
            common.Editor(ord=ord, dataset=ds, contributor=data['Contributor'][cid]))

    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=form2audio.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if getattr(c, 'name', None)])

def run(args):
    db = get_db(args)
    in_db = {r[0]: r[1] for r in db.fetchall('select id, version from dataset')}
    table = Table('ID', 'Title')
    cols = collections.OrderedDict([
        (col, {}) for col in COLS if getattr(args, col, None) or args.all])
    tl = 40
    if cols:
        tl = 25
    table.columns.extend(col.capitalize() for col in cols)
    for col, sql in [
        ('languages', 'glottocodes_by_dataset'),
        ('concepts', 'conceptsets_by_dataset'),
        ('lexemes', 'mapped_lexemes_by_dataset'),
        ('all_lexemes', 'lexemes_by_dataset'),
        ('macroareas', 'macroareas_by_dataset'),
        ('families', 'families_by_dataset'),
    ]:
        if col in cols:
            cols[col] = {r[0]: r[1] for r in db.fetchall(sql)}
    datasets = get_datasets(args)
    for ds in datasets:
        row = [
            termcolor.colored(ds.id, 'green' if ds.id in in_db else 'red'),
            textwrap.shorten(ds.metadata.title or '', width=tl),
        ]
        for col in cols:
            if col == 'version':
                row.append(ds.repo.hash())
            elif col == 'location':
                row.append(termcolor.colored(str(ds.dir), 'green'))
            elif col == 'changes':
                row.append(ds.repo.is_dirty())
            elif col == 'license':
                lic = licenses.find(ds.metadata.license or '')
                row.append(lic.id if lic else ds.metadata.license)
            elif col in ['languages', 'concepts', 'lexemes', 'all_lexemes', 'families']:
                row.append(float(cols[col].get(ds.id, 0)))
            elif col == 'macroareas':
                row.append(', '.join(sorted((cols[col].get(ds.id) or '').split(','))))
            else:
                row.append('')
        table.append(row)
    totals = ['zztotal', len(datasets)]
    for i, col in enumerate(cols):
        if col in ['lexemes', 'all_lexemes']:
            totals.append(sum([r[i + 2] for r in table]))
        elif col == 'languages':
            totals.append(float(db.fetchone(
                "SELECT count(distinct glottocode) FROM languagetable")[0]))
        elif col == 'concepts':
            totals.append(float(db.fetchone(
                "SELECT count(distinct concepticon_id) FROM parametertable")[0]))
        elif col == 'families':
            totals.append(float(db.fetchone(
                "SELECT count(distinct family) FROM languagetable")[0]))
        else:
            totals.append('')
    table.append(totals)
    print(table.render(
        tablefmt='simple', sortkey=lambda r: r[0], condensed=False, floatfmt=',.0f'))

def _main(data, glottolog):
    languoids = list(glottolog.languoids())
    lbyi = {l.iso: l for l in languoids if l.iso}

    dataset = common.Dataset(
        id='ldh',
        name='Language Description Heritage',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='ldh.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    DBSession.add(common.Editor(
        dataset=dataset,
        contributor=common.Contributor(id='forkel', name='Robert Forkel')))

    ls = set()
    for post in iter_posts():
        if post.pure_item_id:
            item = pure.Item.from_json(post.pure_item_id)
            src = data['Description'].get(item.id)
            if not src:
                src = data.add(
                    models.Description,
                    item.id,
                    id=item.id,
                    description=item.title,
                    name=item.name,
                    bibtex_type=EntryType.get(item.bibtex_type),
                    year=item.year,
                    title=item.title,
                    address=item.publisher.get('place') if item.publisher else None,
                    publisher=item.publisher.get('publisher') if item.publisher else None,
                    author=' and '.join(item.authors),
                    editor=' and '.join(item.editors),
                    pid=item.doi or item.pid,
                    pid_type='doi' if item.doi else 'hdl',
                )
                DBSession.flush()
                for file in item.files:
                    if file.visibility == 'PUBLIC' \
                            and file.metadata["contentCategory"] == "any-fulltext" \
                            and file.storage == 'INTERNAL_MANAGED':
                        assert file.mimeType == 'application/pdf'
                        DBSession.add(common.Source_files(
                            id=file.pid.replace('/', '__'),
                            name=file.name,
                            object_pk=src.pk,
                            mime_type=file.mimeType,
                            jsondata=dict(
                                size=file.size,
                                license=attr.asdict(file.license) if file.license else None),
                        ))
            for iso in item.isocodes:
                if iso in lbyi:
                    gl = lbyi[iso]
                    l = data['LDHLanguage'].get(iso)
                    if not l:
                        l = data.add(models.LDHLanguage, iso, id=iso, name=gl.name)
                    DBSession.flush()
                    if (item.id, iso) not in ls:
                        DBSession.add(
                            common.LanguageSource(language_pk=l.pk, source_pk=src.pk))
                        ls.add((item.id, iso))

    for item in zenodo.iter_items():
        src = data.add(
            models.Description,
            item.id,
            id=item.id,
            description=item['metadata']['title'],
            name=item.name,
            bibtex_type=EntryType.get(item.bibtex_type),
            year=item.year,
            title=item['metadata']['title'],
            publisher='Zenodo',
            author=' and '.join(a['name'] for a in item['metadata']['creators']),
            pid=item['metadata']['doi'],
            pid_type='doi',
        )
        DBSession.flush()
        for file in item['files']:
            license = licenses.find(item['metadata']['license']['id'])
            DBSession.add(common.Source_files(
                id=file['checksum'].replace('md5:', ''),
                name=file['key'],
                object_pk=src.pk,
                mime_type='application/' + file['type'],
                jsondata=dict(
                    size=file['size'],
                    url=file['links']['self'],
                    license=attr.asdict(license) if license else None),
            ))
        for kw in item['metadata']['keywords']:
            if not kw.startswith('iso:'):
                continue
            iso = kw.replace('iso:', '')
            if iso in lbyi:
                gl = lbyi[iso]
                l = data['LDHLanguage'].get(iso)
                if not l:
                    l = data.add(models.LDHLanguage, iso, id=iso, name=gl.name)
                DBSession.flush()
                if (item.id, iso) not in ls:
                    DBSession.add(
                        common.LanguageSource(language_pk=l.pk, source_pk=src.pk))
                    ls.add((item.id, iso))

    load_families(
        data,
        data['LDHLanguage'].values(),
        glottolog_repos=glottolog.repos,
        isolates_icon='tcccccc')

def license(self):
    return find((self.metadata.get('license') or '').strip())

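# The normalize-or-fall-back idiom above recurs in all of the ls-style
# commands in this section: try to resolve the raw metadata string, and keep
# it unchanged when find() draws a blank. A standalone sketch; the helper
# name is hypothetical, not part of any of these code bases:
from clldutils import licenses


def normalized_license_id(raw):
    """Return the canonical license id for `raw`, or `raw` unchanged if unknown."""
    lic = licenses.find((raw or '').strip())
    return lic.id if lic else raw


assert normalized_license_id('https://creativecommons.org/licenses/by/4.0/') == 'CC-BY-4.0'
assert normalized_license_id('All rights reserved') == 'All rights reserved'
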
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name,
        },
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    DBSession.add(common.ContributionContributor(
        contribution=contrib,
        contributor=data['Contributor']['kondic'],
    ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)

    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            subgroup=glang.lineage[1][0] if glang and len(glang.lineage) > 1 else None,
        )
    colors = dict(zip(
        set(l.subgroup for l in data['Variety'].values()),
        qualitative_colors(len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    # Store proto-forms for later lookup:
    proto_forms = collections.defaultdict(lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][form['parameterReference']].append(
                form['form'])

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))

def main(args):
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name,
        },
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)

    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )

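# All three CLLD loaders above derive the license icon filename from the
# license id with the same one-liner. Isolated, the string manipulation looks
# like this; the resulting 'cc-by.png' matches the icon name hard-coded in
# the ldh loader above:
from clldutils import licenses

license = licenses.find('https://creativecommons.org/licenses/by/4.0/')
# drop the version suffix ('4.0'), lowercase the rest: 'CC-BY-4.0' -> 'cc-by'
icon = '{}.png'.format('-'.join(p.lower() for p in license.id.split('-')[:-1]))
assert icon == 'cc-by.png'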