Example #1
    def common_props(self):
        res = {
            "dc:title": self.title,
            "dc:description": self.description,
            "dc:bibliographicCitation": self.citation,
            "dc:license": licenses.find(self.license or ''),
            "dc:identifier": self.url,
            "dc:format": [
                "http://concepticon.clld.org/contributions/{0}".format(cl)
                for cl in self.conceptlist
            ],
            "dc:isVersionOf": "http://lexibank.clld.org/contributions/{0}".format(
                self.derived_from) if self.derived_from else None,
            "dc:related": self.related,
            "aboutUrl": self.aboutUrl,
        }
        if self.known_license:
            res['dc:license'] = self.known_license.url
        elif self.license:
            res['dc:license'] = self.license

        return res
Example #2
def ls(args):
    """
    gelato ls [COLS]+

    column specification:
    - license
    - macroareas
    """
    table = Table('ID', 'Title')
    cols = [col for col in args.args if col in ['license', 'macroareas']]
    tl = 40
    if args.args:
        tl = 25
        table.columns.extend(col.capitalize() for col in cols)
    for d in data_path(repos=Path(args.gelato_repos)).iterdir():
        if is_dataset_dir(d):
            ds = Dataset(d)
            row = [d.name, ds.md['dc:title']]
            for col in cols:
                if col == 'license':
                    lic = licenses.find(ds.md.get('dc:license') or '')
                    row.append(lic.id if lic else ds.md.get('dc:license'))

            table.append(row)
    print(
        table.render(tablefmt='simple',
                     sortkey=lambda r: r[0],
                     condensed=False))
Example #3
File: record.py Project: cldf/cldf-zenodo
 def bibtex(self):
     src = Source(
         'misc',
         self.doi.split('/')[-1].replace('.', '-'),
         author=' and '.join(self.creators),
         title=self.title,
         keywords=', '.join(self.keywords),
         publisher='Zenodo',
         year=self.year,
         doi=self.doi,
         url='https://doi.org/{}'.format(self.doi),
     )
     if self.license:
         lic = licenses.find(self.license)
         src['copyright'] = lic.name if lic else self.license
     return src.bibtex()
Example #4
def ls(args):
    """
    lexibank ls [COLS]+

    column specification:
    - license
    - lexemes
    - macroareas
    """
    # FIXME: how to smartly choose columns?
    table = Table('ID', 'Title')
    cols = [
        col for col in args.args
        if col in ['license', 'lexemes', 'macroareas']
    ]
    tl = 40
    if args.args:
        tl = 25
        table.columns.extend(col.capitalize() for col in cols)
    for d in data_path(repos=Path(args.lexibank_repos)).iterdir():
        if is_dataset_dir(d):
            ds = Dataset(d)
            row = [d.name, short_title(ds.md['dc:title'], l=tl)]
            for col in cols:
                if col == 'license':
                    lic = licenses.find(ds.md.get('dc:license') or '')
                    row.append(lic.id if lic else ds.md.get('dc:license'))
                elif col in ['lexemes', 'macroareas']:
                    mds = list(ds.iter_cldf_metadata())
                    if col == 'lexemes':
                        row.append(
                            sum(md.notes['stats']['lexeme_count']
                                for md in mds))
                    elif col == 'macroareas':
                        mas = set()
                        for md in mds:
                            mas = mas.union(md.notes['stats']['macroareas'])
                        row.append(', '.join(sorted(mas)))

            table.append(row)
    print(
        table.render(tablefmt='simple',
                     sortkey=lambda r: r[0],
                     condensed=False))
Example #5
 def known_license(self):
     if self.license:
         return licenses.find(self.license)
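
Most of these examples share the same lookup-with-fallback idiom: the raw metadata string is passed to licenses.find, and if no known license matches, the raw string is kept. A minimal stand-alone sketch of that idiom, assuming clldutils is installed (the helper name normalize_license_id is hypothetical, not part of the library):

from clldutils import licenses

def normalize_license_id(raw):
    """Return the canonical license id if recognized, otherwise the raw value."""
    lic = licenses.find(raw or '')  # same guard against None/'' as in Examples #2, #4 and #6
    return lic.id if lic else raw

# e.g. normalize_license_id('http://creativecommons.org/licenses/by/4.0') -> 'CC-BY-4.0'
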
Example #6
def ls(args):
    """
    lexibank ls [COLS]+

    column specification:
    - license
    - lexemes
    - macroareas
    """
    db = Database(args.db)
    db.create(exists_ok=True)
    in_db = {
        r[0]: r[1]
        for r in db.fetchall('select id, version from dataset')
    }
    # FIXME: how to smartly choose columns?
    table = Table('ID', 'Title')
    cols = OrderedDict([(col, {}) for col in args.args if col in [
        'version',
        'location',
        'changes',
        'license',
        'all_lexemes',
        'lexemes',
        'concepts',
        'languages',
        'families',
        'varieties',
        'macroareas',
    ]])
    tl = 40
    if cols:
        tl = 25
        table.columns.extend(col.capitalize() for col in cols)

    for col, sql in [
        ('languages', 'glottocodes_by_dataset'),
        ('concepts', 'conceptsets_by_dataset'),
        ('lexemes', 'mapped_lexemes_by_dataset'),
        ('all_lexemes', 'lexemes_by_dataset'),
        ('macroareas', 'macroareas_by_dataset'),
        ('families', 'families_by_dataset'),
    ]:
        if col in cols:
            cols[col] = {r[0]: r[1] for r in db.fetchall(sql)}
    for ds in args.cfg.datasets:
        row = [
            colored(ds.id, 'green' if ds.id in in_db else 'red'),
            truncate_with_ellipsis(ds.metadata.title or '', width=tl),
        ]
        for col in cols:
            if col == 'version':
                row.append(git_hash(ds.dir))
            elif col == 'location':
                row.append(colored(str(ds.dir), 'green'))
            elif col == 'changes':
                row.append(ds.git_repo.is_dirty())
            elif col == 'license':
                lic = licenses.find(ds.metadata.license or '')
                row.append(lic.id if lic else ds.metadata.license)
            elif col in [
                    'languages', 'concepts', 'lexemes', 'all_lexemes',
                    'families'
            ]:
                row.append(float(cols[col].get(ds.id, 0)))
            elif col == 'macroareas':
                row.append(', '.join(
                    sorted((cols[col].get(ds.id) or '').split(','))))
            else:
                row.append('')

        table.append(row)
    totals = ['zztotal', len(args.cfg.datasets)]
    for i, col in enumerate(cols):
        if col in ['lexemes', 'all_lexemes']:
            totals.append(sum([r[i + 2] for r in table]))
        elif col == 'languages':
            totals.append(
                float(
                    db.fetchone(
                        "SELECT count(distinct glottocode) FROM languagetable")
                    [0]))
        elif col == 'concepts':
            totals.append(
                float(
                    db.fetchone(
                        "SELECT count(distinct concepticon_id) FROM parametertable"
                    )[0]))
        elif col == 'families':
            totals.append(
                float(
                    db.fetchone(
                        "SELECT count(distinct family) FROM languagetable")
                    [0]))
        else:
            totals.append('')
    table.append(totals)
    print(
        table.render(tablefmt='simple',
                     sortkey=lambda r: r[0],
                     condensed=False,
                     floatfmt=',.0f'))
Example #7
def test_find():
    assert find('http://creativecommons.org/licenses/by/4.0').id == 'CC-BY-4.0'
    assert find(
        'CC-BY-4.0').url == 'https://creativecommons.org/licenses/by/4.0/'
Example #8
def test_legalcode():
    assert find('cc-by-4.0').legalcode
    assert find('Zlib').legalcode is None
Example #9
def test_find():
    from clldutils.licenses import find

    assert find('http://creativecommons.org/licenses/by/4.0').id == 'CC-BY-4.0'
    assert find('CC-BY-4.0').url == 'https://creativecommons.org/licenses/by/4.0/'
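
Taken together, Examples #7 to #9 show the shape of what find returns: given a license id (Example #8 shows a lowercase id resolving as well) or a license URL, it yields a record exposing id, url, name and legalcode; for strings it cannot map it returns a falsy value, which is why the callers above guard with "lic.id if lic else ...". A small usage sketch along those lines:

from clldutils.licenses import find

lic = find('CC-BY-4.0')
if lic:  # falsy when the input cannot be mapped to a known license
    print(lic.id)         # 'CC-BY-4.0'
    print(lic.url)        # 'https://creativecommons.org/licenses/by/4.0/'
    print(lic.name)       # human-readable license name
    print(lic.legalcode)  # link to the legal text; None for some licenses (e.g. Zlib)
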
Example #10
def main(args):  # pragma: no cover
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    clts = CLTS(
        input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name,
        },
    )

    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(
            vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        c = data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(
            common.Editor(ord=ord,
                          dataset=ds,
                          contributor=data['Contributor'][cid]))

    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(Counterpart,
                 form['id'],
                 id=form['id'],
                 name=form['form'],
                 valueset=vs,
                 audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv
                       if getattr(c, 'name', None)])
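
Examples #10, #14 and #15 all derive the license icon filename from license.id by dropping the trailing version segment and lowercasing the rest; Example #12 hard-codes the same result for CC-BY-4.0. A worked sketch of just that expression, outside the web-app setup:

license_id = 'CC-BY-4.0'  # e.g. the id returned by licenses.find for a CC-BY URL
icon = '{}.png'.format('-'.join([p.lower() for p in license_id.split('-')[:-1]]))
assert icon == 'cc-by.png'  # matches the 'license_icon' value hard-coded in Example #12
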
Example #11
def run(args):
    db = get_db(args)
    in_db = {r[0]: r[1] for r in db.fetchall('select id, version from dataset')}

    table = Table('ID', 'Title')
    cols = collections.OrderedDict([
        (col, {}) for col in COLS if getattr(args, col, None) or args.all])
    tl = 40
    if cols:
        tl = 25
        table.columns.extend(col.capitalize() for col in cols)

    for col, sql in [
        ('languages', 'glottocodes_by_dataset'),
        ('concepts', 'conceptsets_by_dataset'),
        ('lexemes', 'mapped_lexemes_by_dataset'),
        ('all_lexemes', 'lexemes_by_dataset'),
        ('macroareas', 'macroareas_by_dataset'),
        ('families', 'families_by_dataset'),
    ]:
        if col in cols:
            cols[col] = {r[0]: r[1] for r in db.fetchall(sql)}
    datasets = get_datasets(args)
    for ds in datasets:
        row = [
            termcolor.colored(ds.id, 'green' if ds.id in in_db else 'red'),
            textwrap.shorten(ds.metadata.title or '', width=tl),
        ]
        for col in cols:
            if col == 'version':
                row.append(ds.repo.hash())
            elif col == 'location':
                row.append(termcolor.colored(str(ds.dir), 'green'))
            elif col == 'changes':
                row.append(ds.repo.is_dirty())
            elif col == 'license':
                lic = licenses.find(ds.metadata.license or '')
                row.append(lic.id if lic else ds.metadata.license)
            elif col in ['languages', 'concepts', 'lexemes', 'all_lexemes', 'families']:
                row.append(float(cols[col].get(ds.id, 0)))
            elif col == 'macroareas':
                row.append(', '.join(sorted((cols[col].get(ds.id) or '').split(','))))
            else:
                row.append('')

        table.append(row)
    totals = ['zztotal', len(datasets)]
    for i, col in enumerate(cols):
        if col in ['lexemes', 'all_lexemes']:
            totals.append(sum([r[i + 2] for r in table]))
        elif col == 'languages':
            totals.append(float(db.fetchone(
                "SELECT count(distinct glottocode) FROM languagetable")[0]))
        elif col == 'concepts':
            totals.append(float(db.fetchone(
                "SELECT count(distinct concepticon_id) FROM parametertable")[0]))
        elif col == 'families':
            totals.append(float(db.fetchone(
                "SELECT count(distinct family) FROM languagetable")[0]))
        else:
            totals.append('')
    table.append(totals)
    print(table.render(
        tablefmt='simple', sortkey=lambda r: r[0], condensed=False, floatfmt=',.0f'))
Example #12
File: initializedb.py Project: clld/ldh
def _main(data, glottolog):
    languoids = list(glottolog.languoids())
    lbyi = {l.iso: l for l in languoids if l.iso}

    dataset = common.Dataset(
        id='ldh',
        name='Language Description Heritage',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='ldh.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        })
    DBSession.add(dataset)

    DBSession.add(
        common.Editor(dataset=dataset,
                      contributor=common.Contributor(id='forkel',
                                                     name='Robert Forkel')))

    ls = set()
    for post in iter_posts():
        if post.pure_item_id:
            item = pure.Item.from_json(post.pure_item_id)
            src = data['Description'].get(item.id)
            if not src:
                src = data.add(
                    models.Description,
                    item.id,
                    id=item.id,
                    description=item.title,
                    name=item.name,
                    bibtex_type=EntryType.get(item.bibtex_type),
                    year=item.year,
                    title=item.title,
                    address=item.publisher.get('place')
                    if item.publisher else None,
                    publisher=item.publisher.get('publisher')
                    if item.publisher else None,
                    author=' and '.join(item.authors),
                    editor=' and '.join(item.editors),
                    pid=item.doi or item.pid,
                    pid_type='doi' if item.doi else 'hdl',
                )
                DBSession.flush()
                for file in item.files:
                    if file.visibility == 'PUBLIC' \
                            and file.metadata["contentCategory"] == "any-fulltext"\
                            and file.storage == 'INTERNAL_MANAGED':
                        assert file.mimeType == 'application/pdf'
                        DBSession.add(
                            common.Source_files(
                                id=file.pid.replace('/', '__'),
                                name=file.name,
                                object_pk=src.pk,
                                mime_type=file.mimeType,
                                jsondata=dict(size=file.size,
                                              license=attr.asdict(file.license)
                                              if file.license else None),
                            ))
            for iso in item.isocodes:
                if iso in lbyi:
                    gl = lbyi[iso]
                    l = data['LDHLanguage'].get(iso)
                    if not l:
                        l = data.add(models.LDHLanguage,
                                     iso,
                                     id=iso,
                                     name=gl.name)
                    DBSession.flush()
                    if (item.id, iso) not in ls:
                        DBSession.add(
                            common.LanguageSource(language_pk=l.pk,
                                                  source_pk=src.pk))
                        ls.add((item.id, iso))

    for item in zenodo.iter_items():
        src = data.add(
            models.Description,
            item.id,
            id=item.id,
            description=item['metadata']['title'],
            name=item.name,
            bibtex_type=EntryType.get(item.bibtex_type),
            year=item.year,
            title=item['metadata']['title'],
            publisher='Zenodo',
            author=' and '.join(a['name']
                                for a in item['metadata']['creators']),
            pid=item['metadata']['doi'],
            pid_type='doi',
        )
        DBSession.flush()
        for file in item['files']:
            license = licenses.find(item['metadata']['license']['id'])
            DBSession.add(
                common.Source_files(
                    id=file['checksum'].replace('md5:', ''),
                    name=file['key'],
                    object_pk=src.pk,
                    mime_type='application/' + file['type'],
                    jsondata=dict(
                        size=file['size'],
                        url=file['links']['self'],
                        license=attr.asdict(license) if license else None),
                ))

        for kw in item['metadata']['keywords']:
            if not kw.startswith('iso:'):
                continue
            iso = kw.replace('iso:', '')
            if iso in lbyi:
                gl = lbyi[iso]
                l = data['LDHLanguage'].get(iso)
                if not l:
                    l = data.add(models.LDHLanguage, iso, id=iso, name=gl.name)
                DBSession.flush()
                if (item.id, iso) not in ls:
                    DBSession.add(
                        common.LanguageSource(language_pk=l.pk,
                                              source_pk=src.pk))
                    ls.add((item.id, iso))

    load_families(data,
                  data['LDHLanguage'].values(),
                  glottolog_repos=glottolog.repos,
                  isolates_icon='tcccccc')
Example #13
File: pure.py Project: clld/ldh
 def license(self):
     return find((self.metadata.get('license') or '').strip())
Example #14
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name,
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    DBSession.add(
        common.ContributionContributor(
            contribution=contrib,
            contributor=data['Contributor']['kondic'],
        ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor,
                 ed,
                 dataset=ds,
                 contributor=data['Contributor'][ed],
                 ord=i)

    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            subgroup=glang.lineage[1][0]
            if glang and len(glang.lineage) > 1 else None,
        )
    colors = dict(
        zip(
            set(l.subgroup for l in data['Variety'].values()),
            qualitative_colors(
                len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    # Store proto-forms for later lookup:
    proto_forms = collections.defaultdict(
        lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][
                form['parameterReference']].append(form['form'])

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
Example #15
def main(args):
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format(
                '-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name,
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor,
                 ed,
                 dataset=ds,
                 contributor=data['Contributor'][ed],
                 ord=i)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )