Example #1
def test_Reference():
    ref = Reference(Source('book', 'huber2005', author='Herrmann Huber'),
                    '2-5')
    assert '2-5' in repr(ref)
    assert '%s' % ref == 'huber2005[2-5]'
    with pytest.raises(ValueError):
        Reference(Source('book', 'huber2005', author='Herrmann Huber'),
                  '[2-5]')
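
A minimal standalone sketch of the behaviour exercised above, assuming Source and Reference are importable from pycldf.sources (an assumption; the test module may import them differently):

from pycldf.sources import Source, Reference  # assumed import path

src = Source('book', 'huber2005', author='Herrmann Huber')
ref = Reference(src, '2-5')   # pages are attached to the source
print('%s' % ref)             # -> huber2005[2-5]
try:
    Reference(src, '[2-5]')   # bracketed page strings are rejected
except ValueError:
    print('brackets are not allowed in page specifications')
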
Example #2
def test_from_entry():
    e = Entry('book', fields={'title': 'Title'})
    assert Source.from_entry('abc', e)['title'] == 'Title'

    with pytest.raises(ValueError):
        Source.from_entry('a.b', e)

    assert Source.from_entry('a.b', e, _check_id=False).id == 'a.b'
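
A standalone sketch of the same conversion, assuming Entry comes from pybtex.database (which is what the test fixture suggests):

from pybtex.database import Entry  # assumed import path

e = Entry('book', fields={'title': 'Title'})
src = Source.from_entry('abc', e)   # well-formed IDs pass the default check
assert src['title'] == 'Title'
# IDs containing characters such as '.' are only accepted when the check is disabled:
assert Source.from_entry('a.b', e, _check_id=False).id == 'a.b'
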
Example #3
def bibdata(sheet, values, e, lgks, unresolved):
    def clean_key(key):
        return key.replace(':', '_').replace("'", "")

    for row in values:
        if row.Source:
            row.Source_comment = row.Source
            refs, sources = collections.OrderedDict(), []
            uc = sum(list(unresolved.values()))
            res = srctok.source_to_refs(row.Source, sheet.glottocode, e, lgks,
                                        unresolved)
            if sum(list(unresolved.values())) > uc:  # pragma: no cover
                row.Source_comment += ' (source not confirmed)'
            for key, pages in res[0]:
                typ, fields = e[key]
                ref = key = clean_key(key)
                if ref not in refs:
                    refs[ref] = set()
                refs[ref] = refs[ref].union(pages or [])
                sources.append(Source(typ, key, **fields))

            row.Source = [
                '{}{}'.format(r,
                              '[{}]'.format(','.join(sorted(p))) if p else '')
                for r, p in refs.items()
            ]
            for src in sources:
                yield src
Example #4
def test_Sources(tmpdir):
    src = Sources()
    src.add(BIB, Source(
        'book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
    for entry in src:
        assert entry.genre == 'book'
        break
    assert len(list(src.items())) == 3
    assert len(list(src.keys())) == 3
    refs = ['huber2005[1-6]', 'Obrazy', 'Elegie[34]']
    assert src.format_refs(*list(src.expand_refs(refs))) == refs
    assert '%s' % src['huber2005'] == 'Huber, Herrmann. 2005. y.'
    with pytest.raises(ValueError):
        src.add(5)
    with pytest.raises(ValueError):
        src.add('@misc{a.b,\n  author="a.b"\n}')
    with pytest.raises(ValueError):
        _ = src['unknown']
        assert _  # pragma: no cover
    with pytest.raises(ValueError):
        src.parse('a[x')
    with pytest.raises(ValueError):
        src.parse('[x]')
    with pytest.raises(ValueError):
        src.validate(['x'])

    bib = str(tmpdir / 'test.bib')
    src.write(bib)

    src2 = Sources()
    src2.read(bib)

    src2.write(bib, ids=['huber2005'])
    src = Sources.from_file(bib)
    assert len(src) == 1
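
A condensed round-trip sketch of the Sources API exercised above (the literal file name is purely illustrative):

srcs = Sources()
srcs.add(Source('book', 'huber2005', author='Herrmann Huber', year='2005', title='y'))
srcs.write('sources.bib')                        # serialize to BibTeX
roundtripped = Sources.from_file('sources.bib')  # parse it back
assert roundtripped['huber2005'].genre == 'book'
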
Example #5
    def to_cldf(self, dest, mdname='cldf-metadata.json', coordinate_precision=4):
        """
        Write the data from the db to a CLDF dataset according to the metadata in `self.dataset`.

        :param dest:
        :param mdname:
        :return: path of the metadata file
        """
        dest = pathlib.Path(dest)
        if not dest.exists():
            dest.mkdir()

        data = self.read()

        if data[self.source_table_name]:
            sources = Sources()
            for src in data[self.source_table_name]:
                sources.add(Source(
                    src['genre'],
                    src['id'],
                    **{k: v for k, v in src.items() if k not in ['id', 'genre']}))
            sources.write(dest / self.dataset.properties.get('dc:source', 'sources.bib'))

        for table_type, items in data.items():
            try:
                table = self.dataset[table_type]
                items = [
                    self.round_geocoordinates(item, precision=coordinate_precision)
                    for item in items]
                table.common_props['dc:extent'] = table.write(
                    [self.retranslate(table, item) for item in items],
                    base=dest)
            except KeyError:
                assert table_type == self.source_table_name, table_type
        return self.dataset.write_metadata(dest / mdname)
Example #6
def parse_source(src):
    sid = src.get("source-id")
    if _get_text(src, "type") == "Informant":
        kw = dict(howpublished="Informant: {0}".format(_get_text(src, "full-citation")))
    else:
        kw = dict(howpublished=_get_text(src, "full-citation"), note=_get_text(src, "note"))
    kw["key"] = _get_text(src, "citation-key")
    return Source("misc", sid, **kw)
Example #7
    def itersources(self, pkmap):
        for row in self.raw_dir.read_csv('source.csv', dicts=True):
            jsondata = json.loads(row.pop('jsondata', '{}') or '{}')
            pkmap['source'][row.pop('pk')] = row['id']
            row['title'] = row.pop('description')
            row['key'] = row.pop('name')
            if (not row['url']) and jsondata.get('gbs', {}).get('id'):
                row['url'] = 'https://books.google.de/books?id=' + jsondata['gbs']['id']
            yield Source(row.pop('bibtex_type'), row.pop('id'), **row)
Example #8
def test_field_order(tmpdir):
    srcs = Sources()
    src = Source('misc', 'x')  # src is an OrderedDict and we add title *after* year.
    src['year'] = '2018'
    src['title'] = 'The Title'
    srcs.add(src)
    bib = tmpdir / 'test.bib'
    srcs.write(str(bib))
    res = bib.read_text(encoding='utf8')
    # Still, title should be printed in the BibTeX before year:
    assert res.index('title =') < res.index('year =')
Example #9
def test_Source_expand_refs():
    sources = Sources()
    src = Source(
        'book', 'Meier2005', author='Hans Meier', year='2005', title='The Book')
    assert 'Meier2005' in repr(src)
    sources.add(src)
    bib = sources._bibdata.to_string(bib_format='bibtex')
    assert len(bib.split('author')) == 2
    assert len(list(sources.expand_refs('Meier2005'))) == 1
    bib = sources._bibdata.to_string(bib_format='bibtex')
    assert len(bib.split('author')) == 2
    assert len(list(sources.expand_refs('12345'))) == 1
Example #10
def get_reference(author, year, title, pages, sources, id_=None, genre='misc'):
    kw = {'title': title}
    id_ = id_ or None
    if author and year:
        id_ = id_ or slug(author + year)
        kw.update(author=author, year=year)
    elif title:
        id_ = id_ or slug(title)

    if not id_:
        return

    source = sources.get(id_)
    if source is None:
        sources[id_] = source = Source(genre, id_, **kw)

    return Reference(source, pages)
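
A usage sketch for get_reference, assuming sources behaves like a plain dict and slug (presumably clldutils.misc.slug) lowercases and strips non-alphanumeric characters:

sources = {}
ref = get_reference('Meier', '2005', 'The Book', '12-15', sources)
# The ID is derived as slug('Meier' + '2005') -> 'meier2005'; the new Source is
# cached in `sources`, so later calls with the same author/year reuse it.
assert 'meier2005' in sources
print('%s' % ref)  # expected to render as meier2005[12-15]
# With neither author/year nor a title, no ID can be derived and None is returned:
assert get_reference(None, None, None, None, sources) is None
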
Example #11
    def cmd_makecldf(self, args):
        data = self.raw_dir.read_csv('raw.tsv', delimiter="\t", dicts=True)

        # Quite a hack to allow things like "1995.pdfb" as Source IDs:
        bib = pybtex.database.parse_string(self.raw_dir.read('sources.bib'), bib_format='bibtex')
        sources = []
        for k, e in bib.entries.items():
            # Unfortunately, Source.from_entry does not allow any keyword arguments to be passed
            # to the constructor, see https://github.com/cldf/pycldf/issues/99
            e.fields['_check_id'] = False
            sources.append(Source.from_entry(k, e))
        args.writer.add_sources(*sources)

        language_lookup = args.writer.add_languages(lookup_factory='NameInSource')
        concept_lookup = args.writer.add_concepts(
            id_factory=lambda x: x.id.split('-')[-1]+'_'+slug(x.english),
            lookup_factory='Name'
        )
        lang_sources = {l['NameInSource']: l['Source'].split(",") for l in self.languages}

        # remap concepts for personal pronouns
        remap_concepts = {
            '1SG pronoun': '1sg pronoun',
            '2SG pronoun': '2sg pronoun',
            '3SG pronoun': '3sg pronoun',
        }

        for line_dict in progressbar(data, desc='cldfify'):
            concept = line_dict['Meaning']
            concept_id = concept_lookup.get(remap_concepts.get(concept, concept))
            for language, language_id in language_lookup.items():
                value = line_dict[language].strip()
                if value:
                    args.writer.add_form(
                        Value=value,
                        Form=value,
                        Parameter_ID=concept_id,
                        Language_ID=language_id,
                        Source=lang_sources[language]
                    )
Example #12
def cldf(dataset, concepticon, **kw):
    concept_map = {
        int(c['GLOSS']): c['CONCEPTICON_ID'] or None
        for c in dataset.concepts
    }

    gc_pattern = re.compile('[a-z0-9]{4}[1-9][0-9]{3}$')
    meta = {}
    for row in read_csv(dataset, 'META'):
        meta[(row[5], row[9])] = dict(
            zip(
                'NAME,COUNTRY,ISO,GLOTTO_NAME,GLOTTO_CODE,LG_LINK,AUDIO,SOURCE,NR_SETS,VARIANT'
                .lower().split(','), row))

    sources = {}
    sid = 0
    for spec in meta.values():
        if spec['source'] and spec['source'] not in sources:
            sid += 1
            sources[spec['source']] = Source('misc',
                                             's%s' % sid,
                                             title=spec['source'])

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Comment',
    ), dataset) as ds:
        for key, items in groupby(
                sorted(read_csv(dataset, 'NUMERAL'),
                       key=lambda r: (r[2], r[3], r[0])), lambda r:
            (r[2], r[3])):
            if key not in meta:
                continue
            if int(float(key[1])) > 1:
                continue
            md = meta[key]
            source, ref = sources.get(md['source']), None
            if source:
                ds.sources.add(source)
                ref = source.id
            if gc_pattern.match(md['glotto_code']):
                for concept, rows in groupby(items, lambda k: k[0]):
                    if not concept.endswith('.0'):
                        continue
                    iconcept = int(float(concept))
                    if iconcept not in concept_map:
                        unmapped.concepts.add((iconcept, iconcept))
                    for k, row in enumerate(rows):
                        ds.add_row([
                            '%s-%s-%s' % (lgid(row[2]), iconcept, k + 1),
                            md['glotto_code'],
                            md['name'],
                            concept_map.get(iconcept),
                            '%s' % iconcept,
                            row[1],
                            ref,
                            row[4] or None,
                        ])
    unmapped.pprint()
Example #13
    def to_cldf(self, ds, concepts):
        """
        :param ds: the dataset object
        :param concepts: a dictionary mapping concept labels to concept ids

        :return: A dataset object, ds.
        """
        source = []
        if self.language.source:
            bib = parse_string(self.language.source, "bibtex")
            try:
                ds.add_sources(
                    *[Source.from_entry(k, e) for k, e in bib.entries.items()])
                source = list(bib.entries.keys())
            except:  # noqa: E722
                self.log.warning("Invalid citekey for %s" % self.language.id)

        ds.add_language(ID=self.language.id,
                        Glottocode=self.language.glottocode,
                        ISO639P3code=self.language.iso,
                        Name=self.language.name,
                        author=self.language.author,
                        url=self.url('language.php?id=%s' % self.language.id),
                        typedby=self.language.typedby,
                        checkedby=self.language.checkedby,
                        notes=self.language.notes,
                        source=";".join(source))

        for entry in self.entries:
            if entry.name is None or len(
                    entry.name) == 0:  # skip empty entries
                continue  # pragma: no cover

            # skip entries marked as incorrect word form due to semantics
            # (x = probably, s = definitely)
            if entry.cognacy and entry.cognacy.lower() in ('s', 'x'):
                continue  # pragma: no cover

            # handle concepts
            cid = concepts.get(entry.word_id)
            if not cid:
                self.dataset.unmapped.add_concept(ID=entry.word_id,
                                                  Name=entry.word)
                # add it if we don't have it.
                ds.add_concept(ID=entry.word_id, Name=entry.word)
                cid = entry.word_id

            # handle lexemes
            try:
                lex = ds.add_forms_from_value(
                    Local_ID=entry.id,
                    Language_ID=self.language.id,
                    Parameter_ID=cid,
                    Value=entry.name,
                    # set source to entry-level sources if they exist, otherwise use
                    # the language level source.
                    Source=[entry.source] if entry.source else source,
                    Cognacy=entry.cognacy,
                    Comment=entry.comment or '',
                    Loan=True if entry.loan and len(entry.loan) else False,
                )
            except:  # NOQA: E722; pragma: no cover
                print("ERROR with %r -- %r" % (entry.id, entry.name))
                raise

            if lex:
                for cognate_set_id in entry.cognates:
                    match = self.dataset.cognate_pattern.match(cognate_set_id)
                    if not match:  # pragma: no cover
                        self.log.warning(
                            'Invalid cognateset ID for entry {0}: {1}'.format(
                                entry.id, cognate_set_id))
                    else:
                        # make global cognate set id
                        cs_id = "%s-%s" % (slug(entry.word), match.group('id'))

                        ds.add_cognate(lexeme=lex[0],
                                       Cognateset_ID=cs_id,
                                       Doubt=bool(match.group('doubt')),
                                       Source=['Greenhilletal2008'] if
                                       self.section == 'austronesian' else [])

        return ds
Example #14
    def read_bib(self, fname='sources.bib'):
        bib = database.parse_string(self.read(fname), bib_format='bibtex')
        return [Source.from_entry(k, e) for k, e in bib.entries.items()]
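
A hypothetical call site for read_bib (raw_dir and cldf_writer are illustrative names, not part of the source):

sources = raw_dir.read_bib('sources.bib')   # list of pycldf Source objects
cldf_writer.add_sources(*sources)           # hand them to a CLDF writer, as in the examples above
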
Example #15
def cldf(dataset, concepticon, **kw):
    data = get_all(dataset)
    gl_map = {k: v.id for k, v in dataset.glottolog_languoids.items()}
    gl_map.update(dataset.glottocode_by_iso)

    swadesh_concepts = {
        k: v
        for k, v in data['word'].items() if v['id'] in data['concept_ids']
    }

    def normalized_gloss(gloss):
        if gloss.startswith('to '):
            gloss = gloss[3:].strip()
        if '/' in gloss:
            gloss = gloss.split('/')[0].strip()
        if '(' in gloss:
            gloss = gloss.split('(')[0].strip()
        if gloss.endswith('?'):
            gloss = gloss[:-1]
        return gloss

    swadesh2concepticon = {
        'right (hand)': '2183',
        'we incl. (pronoun d:1p, incl)': '1131',
        'left (hand)': '2182',
        'right (correct, true)': '1725',
        'in, inside': '1460',
        'to lie down': '215',
    }
    for conceptlist in [
            'Swadesh-1960-200', 'Swadesh-1971-100', 'Swadesh-1955-100',
            'Swadesh-1950-215', 'Swadesh-1955-215'
    ]:
        for d in concepticon.conceptlists[conceptlist].concepts.values():
            swadesh2concepticon.setdefault(d.english, d.concepticon_id)

    concept_map = {}
    for concept in swadesh_concepts.values():
        gloss = normalized_gloss(concept['word'])
        if gloss in swadesh2concepticon:
            concept_map[concept['id']] = swadesh2concepticon[gloss]
        elif concept['word'] in swadesh2concepticon:
            concept_map[concept['id']] = swadesh2concepticon[concept['word']]
        else:
            raise ValueError(concept['word'])
    assert len(concept_map) == len(set(concept_map.values()))

    for c in dataset.concepts:
        if c['CONCEPTICON_ID']:
            concept_map[int(c['ID'])] = c['CONCEPTICON_ID'] or None

    uc = Counter()
    unmapped = Unmapped(lambda r: int(r[0]))
    for language_url, words in groupby(
            sorted(data['lexicon'].values(), key=lambda i: i['language']),
            lambda i: i['language']):
        contribution = data['language'][language_url]
        with CldfDataset((
                'ID',
                'Language_ID',
                'Language_iso',
                'Language_name',
                'Language_local_ID',
                'Parameter_ID',
                'Parameter_name',
                'Parameter_local_ID',
                'Value',
                'Source',
                'Cognate_Set',
                'Comment',
                'Loan',
        ),
                         dataset,
                         subset=contribution['id']) as ds:
            cname = contribution['language']
            if contribution['dialect']:
                cname += ' (%s Dialect)' % contribution['dialect']
            lid = gl_map.get(contribution['glottocode'])
            if not lid:
                lid = gl_map.get(contribution['isocode'])
                if not lid:
                    unmapped.languages.add(
                        (contribution['id'], cname, contribution['isocode']))
            if contribution['information']:
                ds.metadata['dc:description'] = contribution['information']

            ds.table.schema.aboutUrl = '%s.csv#{ID}' % ds.name
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                '%s/word/{Parameter_local_ID}' % BASE_URL
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                '%s/language/{Language_local_ID}' % BASE_URL

            for word in words:
                concept = data['word'][word['word']]
                if concept['id'] not in concept_map:
                    unmapped.concepts.add((concept['id'], concept['word']))
                    uc.update([concept['word']])
                src = data['source'].get(word['source'])
                if src:
                    ds.sources.add(
                        Source('misc',
                               src['slug'],
                               author=src['author'],
                               year=src['year'],
                               transnewguinea_id=BASE_URL + '/source/' +
                               src['slug'],
                               title=src['reference']))
                ds.add_row([
                    word['id'],
                    lid,
                    contribution['isocode'],
                    cname,
                    contribution['slug'],
                    concept_map.get(concept['id']),
                    concept['word'],
                    concept['slug'],
                    word['entry'],
                    src['slug'] if src else None,
                    None,
                    word['annotation'],
                    word['loan'],
                ])
    unmapped.pprint()
Example #16
    def cmd_makecldf(self, args):
        asjp = ASJP(self.raw_dir)

        meaning_id_lookup = {v: k for k, v in MEANINGS_ALL.items()}
        meaning_id_lookup["breasts"] = meaning_id_lookup["breast"]

        for concept in sorted(
                self.conceptlists[0].concepts.values(),
                key=lambda c: meaning_id_lookup[c.label.replace("*", "")],
        ):
            args.writer.add_concept(
                ID=meaning_id_lookup[concept.label.replace("*", "")],
                Name=concept.label,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )

        iso2gl = {l.iso: l.id for l in self.glottolog.languoids() if l.iso}

        # Correct glottocodes (mapping, retired ones, etc.)
        iso2gl.update({
            "gtu": "aghu1254",  # from aghu1256, AGHU_THARRNGGALA
            "xss": "kott1239",  # from assa1266, ASSAN
            "bbz":
            "char1283",  # from baba1273, BABALIA_CREOLE_ARABIC (mapping to a dialect of
            # Chadian Arabic, as per SIL documentation)
            "bcb": "bain1259",  # from bain1260, BAYNUNK_GUJAXER*
            "bic": "biso1243",  # from bika1251, BIKARU*
            "bvp": "khan1274",  # from bika1247, BUMANG
            "geg": "kuga1239",  # from geng1243, GENGLE
            "ihi": "emai1241",  # from ihie1238, IHIEE
            "kav": "kana1291",  # from nucl1668, KATUKINA
            "kym": "gbay1278",  # from kpat1244, KPATILI_2
            "mct": "eton1253",  # from meng1263, MENGISA
            "nxu": "kaur1271",  # from nara1265, NARAU
            "nmj": "gund1247",  # from ngom1265, NGOMBE_2
            "nom": "cash1251",  # from noca1240, NOCAMAN
            "otk": "oldu1238",  # from oldt1247, OLD_TURKIC
            "rui": "nden1248",  # from rufi1234, RUFIJI
            "sxm": "somr1240",  # from samr1245, SAMRE
            "xin":
            "xinc1242",  # from xinc1247, mapping to what is by far the most plausible source
            # for Lehmann 1920 (all other varieties were extinct, and the
            # alternative one is dormant and close enough)
        })

        lids = set()
        for doculect in sorted(asjp.iter_doculects(), key=lambda dl: dl.id):
            lid = slug(doculect.id)
            assert lid not in lids, doculect.id
            lids.add(lid)
            sources = asjp.source(doculect) or []
            sources = [
                src for src in sources
                if src.author or src.year or src.title_etc
            ]
            for src in sources:
                args.writer.add_sources(
                    Source("misc",
                           str(src.id),
                           author=src.author,
                           year=src.year,
                           title=src.title_etc))

            # Obtain the glottocode mapping and fix it for cases with multiple mappings
            glottocode = iso2gl.get(doculect.code_iso)
            if lid == "LENCA_EL_SALVADOR":
                glottocode = "lenc1243"
            elif lid == "LENCA_HONDURAS":
                glottocode = "lenc1242"
            elif lid == "DANANSHAN_HMONG":
                glottocode = "hmon1332"
            elif lid == "SHIMENKAN_HMONG":
                glottocode = "smal1236"
            elif lid == "SUYONG_HMONG":
                glottocode = "larg1235"
            elif lid == "URADHI_ANGKAMUTHI":
                glottocode = "angg1238"
            elif lid == "URADHI_ATAMPAYA":
                glottocode = "atam1239"
            elif lid == "URADHI_YADHAYKENU":
                glottocode = "yadh1237"

            args.writer.add_language(
                ID=lid,
                Name=doculect.id,
                ISO639P3code=doculect.code_iso if re.fullmatch(
                    "[a-z]{3}", doculect.code_iso or "") else None,
                Glottocode=glottocode,
                Latitude=doculect.latitude,
                Longitude=doculect.longitude,
                classification_wals=doculect.classification_wals,
                classification_ethnologue=doculect.classification_ethnologue,
                classification_glottolog=doculect.classification_glottolog,
                recently_extinct=doculect.recently_extinct,
                long_extinct=doculect.long_extinct,
                year_of_extinction=doculect.year_of_extinction,
                code_wals=doculect.code_wals,
                code_iso=doculect.code_iso,
                transcribers=" and ".join(
                    sorted(
                        [tr.name for tr in asjp.transcriber(doculect) or []])),
            )
            args.writer.add_languages()
            for synset in sorted(doculect.synsets,
                                 key=lambda ss: ss.meaning_id):
                for word in synset.words:
                    args.writer.add_form(
                        Language_ID=lid,
                        Parameter_ID=synset.meaning_id,
                        Value=word.form,
                        Form=word.form,
                        Loan=word.loan,
                        Comment=synset.comment,
                        gloss_in_source=synset.meaning,
                        Source=sorted([str(src.id) for src in sources]),
                    )
Example #17
    def cmd_makecldf(self, args):
        glottolog = args.glottolog.api

        contrib = ["""\
# Contributors

name | affiliation | orcid | github | role
---  | ----------- | ----- | ------ | ----"""]
        for ed in glottolog.current_editors:
            contrib.append(' | '.join(
                [ed.name, ed.affiliation, ed.orcid, '@' + getattr(ed, 'github'), 'maintainer']))
        self.dir.joinpath('CONTRIBUTORS.md').write_text('\n'.join(contrib), encoding='utf8')

        # write metadata.json
        dump(
            {
                "title": glottolog.publication.zenodo.title_format.format(
                    glottolog.publication.zenodo.version) + ' as CLDF',
                "citation": metadata.citation(glottolog),
                "url": glottolog.publication.web.url,
                "description": glottolog.publication.web.description,
                "id": "glottolog",
                "license": glottolog.publication.zenodo.license_id,
            },
            self.dir / 'metadata.json',
            indent=4,
        )

        ds = args.writer.cldf
        ds.add_provenance(
            wasGeneratedBy=repos('pyglottolog', version=pyglottolog.__version__),
        )
        t = ds.add_component(
            'ParameterTable',
            {
                'name': 'type',
                'datatype': {'base': 'string', 'format': 'categorical|sequential'},
                'dc:description': 'Describes the domain of the parameter',
                'default': None,
            },
            {
                'name': 'infoUrl',
                'dc:description': 'URL (relative to `aboutUrl`) of a web page with further '
                                  'information about the parameter',
                'aboutUrl': 'https://glottolog.org/{+infoUrl}',
            },
            {
                'name': 'datatype',
                'datatype': 'json',
                'dc:description':
                    'CSVW datatype description for values for this parameter. I.e. content of the '
                    'Value column of associated rows in ValueTable should be interpreted/parsed '
                    'accordingly',
            }
        )
        t.common_props['dc:description'] = \
            "This table lists parameters (or aspects) of languoids that Glottolog assigns values " \
            "for, such as the languoid's position on the Glottolog classification or the " \
            "descriptive status. Refer to the `Description` column in the table for details, and " \
            "to the `datatype` column for information on how values for the parameter should be " \
            "interpreted."
        ds.add_component('CodeTable')
        ds.add_columns('ValueTable', 'codeReference')
        t = ds.add_component(
            'LanguageTable',
            {
                'name': 'Countries',
                'separator': ';',
                'dc:description':
                    'ISO 3166-1 alpha-2 country codes for countries a language is spoken in.',
                'propertyUrl': 'https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2',
                'aboutUrl': 'https://en.wikipedia.org/wiki/ISO_3166-2:{Countries}',
            },
            {
                'name': 'Family_ID',
                'dc:description':
                    'Glottocode of the top-level genetic unit the languoid belongs to'},
            {
                'name': 'Language_ID',
                'dc:description':
                    'Glottocode of the language-level languoid the languoid belongs to '
                    '(in case of dialects)'},
        )
        t.common_props['dc:description'] = \
            'This table lists all Glottolog languoids, i.e. families, languages and dialects ' \
            'which are nodes in the Glottolog classification - including "non-genealogical" ' \
            'trees as described at https://glottolog.org/glottolog/glottologinformation . ' \
            'Thus, assumptions about the properties of a languoid listed here should be made ' \
            'after including associated information from ValueTable, in particular for languoid ' \
            'level and category.'
        t.aboutUrl = 'https://glottolog.org/meta/glossary#Languoid'
        ds.add_foreign_key('LanguageTable', 'Family_ID', 'LanguageTable', 'ID')
        ds.add_foreign_key('LanguageTable', 'Language_ID', 'LanguageTable', 'ID')

        ds['LanguageTable', 'Macroarea'].separator = ';'
        ds['ValueTable', 'Value'].null = ['<NA>']

        data = args.writer.objects
        for pid, pinfo in schema.PARAMETER_INFO.items():
            data['ParameterTable'].append(dict(
                ID=pid,
                Name=pinfo.name,
                type=pinfo.type,
                Description=pinfo.description,
                infoUrl=pinfo.url,
                datatype=pinfo.dt.asdict()
            ))
        for level in glottolog.languoid_levels.values():
            data['CodeTable'].append(dict(
                ID='level-{0}'.format(level.name),
                Parameter_ID='level',
                Name=level.name,
                Description=level.description,
            ))
            data['CodeTable'].append(dict(
                ID='category-{0}'.format(level.name.capitalize()),
                Parameter_ID='category',
                Name=level.name.capitalize()))
            data['CodeTable'].append(dict(
                ID='category-Pseudo_{0}'.format(level.name.capitalize()),
                Parameter_ID='category',
                Name='Pseudo {}'.format(level.name.capitalize())))

        for el in sorted(glottolog.language_types.values()):
            data['CodeTable'].append(dict(
                ID='category-{0}'.format(el.category.replace(' ', '_')),
                Parameter_ID='category',
                Name=el.category))

        for el in sorted(glottolog.aes_status.values()):
            data['CodeTable'].append(dict(
                ID='aes-{0}'.format(el.name.replace(' ', '_')),
                Parameter_ID='aes',
                Name=el.name,
                Description='EGIDS: {0.egids}; UNESCO: {0.unesco}; ElCat: {0.unesco}'.format(el),
            ))

        for el in sorted(glottolog.med_types.values()):
            data['CodeTable'].append(dict(
                ID='med-{0}'.format(el.id),
                Parameter_ID='med',
                Name=el.name,
                Description=el.description,
            ))

        languoids = collections.OrderedDict((lang.id, lang) for lang in glottolog.languoids())
        refs_by_languoid, refs = glottolog.refs_by_languoid(languoids)

        def get_language_id(lang):
            if lang.level == glottolog.languoid_levels.dialect:
                for _, lid, _ in reversed(lang.lineage):
                    if languoids[lid].level == glottolog.languoid_levels.language:
                        return lid

        def format_ref(ref):
            return '{0}[{1}]'.format(ref.key, ref.pages.replace(';', ',')) if ref.pages else ref.key

        for lang in languoids.values():
            data['LanguageTable'].append(dict(
                ID=lang.id,
                Name=lang.name,
                Glottocode=lang.id,
                ISO639P3code=lang.iso,
                Latitude=lang.latitude,
                Longitude=lang.longitude,
                Macroarea=[ma.name for ma in lang.macroareas],
                Countries=[c.id for c in lang.countries],
                Family_ID=lang.lineage[0][1] if lang.lineage else None,
                Language_ID=get_language_id(lang),
            ))
            med = sorted(refs_by_languoid[lang.id], reverse=True)[0] \
                if lang.id in refs_by_languoid else None
            if med:
                ds.add_sources(Source(med.type, med.id, _check_id=False, **med.fields))
            clf = lang.classification_comment
            if clf:
                for ref in clf.merged_refs('family') + clf.merged_refs('sub'):
                    if ref.key not in refs:
                        args.log.warning('missing reference in classification comment: {0}'.format(ref))
                        continue
                    e = refs[ref.key]
                    ds.add_sources(Source(e.type, ref.key, _check_id=False, **e.fields))

            aes_src = lang.endangerment.source.reference_id if lang.endangerment else None
            if aes_src:
                e = refs[aes_src]
                ds.add_sources(Source(e.type, aes_src, _check_id=False, **e.fields))

            data['ValueTable'].extend([
                value(
                    lang.id,
                    'level',
                    lang.level.name,
                    Code_ID='level-{0}'.format(lang.level.name)),
                value(
                    lang.id,
                    'category',
                    lang.category.replace(' ', '_'),
                    Code_ID='category-{0}'.format(lang.category.replace(' ', '_')),
                ),
                value(
                    lang.id,
                    'classification',
                    schema.PARAMETER_INFO['classification'].dt.formatted(
                        '/'.join([li[1] for li in lang.lineage])) or None,
                    Source=[format_ref(ref) for ref in clf.merged_refs('family')] if clf else [],
                    Comment=clf.family if clf else None,
                ),
                value(
                    lang.id,
                    'subclassification',
                    lang.newick_node(nodes=languoids, template="{l.id}").newick + ';',
                    Source=[format_ref(ref) for ref in clf.merged_refs('sub')] if clf else [],
                    Comment=clf.sub if clf else None,
                ),
                value(
                    lang.id,
                    'aes',
                    lang.endangerment.status.ordinal if lang.endangerment else None,
                    Comment=lang.endangerment.comment if lang.endangerment else None,
                    Source=[aes_src] if aes_src else [],
                    Code_ID='aes-{0}'.format(lang.endangerment.status.name.replace(' ', '_'))
                    if lang.endangerment else None,
                ),
                value(
                    lang.id,
                    'med',
                    med.med_type.rank if med else None,
                    Source=[med.id] if med else [],
                    Code_ID='med-{0}'.format(med.med_type.id) if med else None,
                ),
            ])
Example #18
def test_Wordlist_2(abvd_dataset):
    with abvd_dataset.cldf as ds:
        for wl in abvd_dataset.iter_wordlists({'1': 'bali1278'}, None):
            wl.to_cldf(ds, {}, citekey='x', source=[Source('a', 'b', **dict(title='t'))])
Example #19
    def to_cldf(self,
                ds,
                concept_map,
                citekey=None,
                source=None,
                concept_key=None):
        if concept_key is None:

            def concept_key(entry):
                return entry.word_id

        ref = None
        if citekey and source:
            ref = citekey
            for r in ref.split(";"):
                for s in source:
                    if isinstance(s, Source):
                        ds.add_sources(s)
                    else:
                        ds.add_sources(Source('misc', r, title=s))

        ds.add_language(
            ID=self.language.id,
            Glottocode=self.language.glottocode,
            ISO639P3code=self.language.iso,
            Name=self.language.name,
            author=self.language.author,
            url=self.url('language.php?id=%s' % self.language.id),
            typedby=self.language.typedby,
            checkedby=self.language.checkedby,
            notes=self.language.notes,
        )

        for entry in self.entries:
            if entry.name is None or len(
                    entry.name) == 0:  # skip empty entries
                continue  # pragma: no cover

            if entry.cognacy and ('s' == entry.cognacy.lower()
                                  or 'x' in entry.cognacy.lower()):
                # skip entries marked as incorrect word form due to semantics
                # (x = probably, s = definitely)
                continue  # pragma: no cover

            if not (citekey and source):
                src = entry.e.find('source')
                if (src is not None) and getattr(src, 'text'):
                    ref = slug(str(src.text))
                    ds.add_sources(Source('misc', ref, title=src.text))
            cid = concept_map.get(concept_key(entry))
            if not cid:
                self.dataset.unmapped.add_concept(ID=entry.word_id,
                                                  Name=entry.word)

            ds.add_concept(ID=entry.word_id,
                           Name=entry.word,
                           Concepticon_ID=cid)
            for lex in ds.add_lexemes(
                    Language_ID=self.language.id,
                    Parameter_ID=entry.word_id,
                    Value=entry.name,
                    Source=[ref],
                    Cognacy=entry.cognacy,
                    Comment=entry.comment or '',
                    Loan=True if entry.loan and len(entry.loan) else False,
                    Local_ID=entry.id,
            ):
                for cognate_set_id in entry.cognates:
                    match = self.dataset.cognate_pattern.match(cognate_set_id)
                    if not match:  # pragma: no cover
                        self.log.warn('Invalid cognateset ID: {0}'.format(
                            cognate_set_id))
                    else:
                        ds.add_cognate(lexeme=lex,
                                       Cognateset_ID=match.group('id'),
                                       Doubt=bool(match.group('doubt')))
                # when an entry is split into multiple forms, we only assign cognate
                # sets to the first one!
                break

        return ds
Example #20
def main(args):
    """ Update cognate codes and alignments of a CLDF dataset from an Edictor file.

    Parameters
    ----------
    args : Namespace
        A Namespace object with several properties listed below.
        edictor : FileType
            ...
        cldf : Path, optional
            Path to the CLDF metadata json file.
        source-id : str, optional
            String used as the source ID for any changes to cognate codes or alignments.
            This defaults to "edictor".
        cogid : str, optional
            String that specifies the header of the column containing the cognate
            set IDs in the Edictor file. This defaults to "COGID".

    Notes
    -----
        Once this function is called with the proper arguments, cognates.csv in the CLDF
        dataset is updated from the Edictor output wherever cognate codes or alignments
        have changed.
        sources.bib also gets a new entry if the given source ID is not yet present.
    """
    # Check CLDF argument, in order to fail early if this fails.
    dataset = pycldf.dataset.Wordlist.from_metadata(args.cldf)

    # Read new cognate classes and their alignments
    new_cognateset_assignments = {}
    alignments = {}
    for row in csv.DictReader(args.edictor, delimiter="\t"):
        if row["ID"].startswith("#") or not row["ID"] or not row["REFERENCE"]:
            # LingPy has comment rows
            continue
        new_cognateset_assignments[row["REFERENCE"]] = tuple(
            row[args.cogid].split())
        try:
            alignments[row["REFERENCE"]] = row["ALIGNMENT"].split()
        except AttributeError:
            print(row)
            raise
    new_cognatesets = swap(new_cognateset_assignments)

    # Column names are assumed to follow standard CLDF CognateTable conventions.
    original_rows = []
    data_on_form = {}
    official_cognateset_assignments = {}
    max_row_id = 0
    for r, row in enumerate(dataset["CognateTable"].iterdicts()):
        original_rows.append(row)
        data_on_form[row["Form_ID"]] = row
        official_cognateset_assignments[row["Form_ID"]] = row["Cognateset_ID"]
        max_row_id = max(max_row_id, row["ID"])
    official_cognatesets = swap(official_cognateset_assignments)

    # Find changed alignments
    for form, data in data_on_form.items():
        if data.get("Alignment", None) == alignments.get(form, False):
            del alignments[form]

    # Construct a set of minimal changes to update cognate sets
    pairs, overlaps = [], []

    # First, sort all pairs of old and new cognate sets by overlap.
    still_to_match = new_cognatesets.copy()
    for name, cognateset in list(official_cognatesets.items()):
        cognateset = cognateset.copy()
        for new_name, new_cognateset in list(still_to_match.items()):
            if cognateset == new_cognateset:
                # This cognate set has not changed, ignore.
                continue

            overlap = cognateset & new_cognateset
            if not overlap:
                # There is no overlap.
                continue

            # This cognate set has changed.
            official_cognatesets.pop(name, None)

            # Insert the pair into an ordered table of pair sizes. Ensure
            # biggest overlaps come first, so actually work with their negatives.
            index = bisect.bisect(overlaps, -len(overlap))
            overlaps.insert(index, -len(overlap))
            pairs.insert(index, (name, new_name))

            remainder = new_cognateset - overlap
            if remainder:
                still_to_match[new_name] = remainder
            else:
                # All entries of this set have been accounted for, remove it.
                del still_to_match[new_name]

            cognateset -= overlap
            if not cognateset:
                # All entries of this set have been accounted for, no need to look further.
                break

    # Now greedily assign new cognate class ids based on the old ones. (This
    # greedy algorithm is not optimal, but that issue should not be too
    # relevant.)
    other_seen = set()
    moved_forms = collections.OrderedDict()
    for index, (name, other) in enumerate(pairs):
        if other in other_seen:
            continue
        else:
            new_name = name
            while new_name in official_cognatesets:
                new_name = new_name + "X"
            official_cognatesets[new_name] = set()
            for form in new_cognatesets[other]:
                if official_cognateset_assignments.get(form) != new_name:
                    moved_forms[form] = new_name
                official_cognatesets[new_name].add(form)
            other_seen.add(other)

    for new_name, new_cognateset in list(still_to_match.items()):
        while new_name in official_cognatesets:
            new_name = new_name + "X"
        for form in new_cognateset:
            moved_forms[form] = new_name
        official_cognatesets[new_name] = new_cognateset

    try:
        source = dataset.sources[args.source_id]
    except ValueError:
        source = Source('misc',
                        id_=args.source_id,
                        year=str(datetime.date.today().year))
        dataset.sources.add(source)
        dataset.write_sources()

    def new_rows(defaults, last_row_id, moved_forms_p, realigned_forms,
                 source_p):
        # TODO: Make a docstring?
        # What is the benefit of defining this within the main function?
        t = type(last_row_id)
        i = -1  # FIXME: Is this assignment necessary?
        empty = {"Alignment": [], "Source": []}
        for i, (form_id,
                new_cognateset_it) in enumerate(moved_forms_p.items()):
            row_new = defaults.get(form_id, empty).copy()
            row_new["ID"] = last_row_id + t(i + 1)
            row_new["Form_ID"] = form_id
            row_new["Cognateset_ID"] = new_cognateset_it
            if form_id in realigned_forms:
                row_new["Alignment"] = realigned_forms[form_id]
                row_new["Source"] = [source.id]
            else:
                row_new["Alignment"] = defaults[form_id]["Alignment"]
                row_new["Source"] = defaults[form_id]["Source"] + [source_p.id]
            realigned_forms[form_id] = None
            print(row_new)
            yield row_new
        for j, (form_id, new_alignment) in enumerate(realigned_forms.items()):
            if not new_alignment:
                continue
            try:
                row_new = defaults[form_id].copy()
                row_new["ID"] = last_row_id + t(i + j + 2)
                row_new["Alignment"] = new_alignment
                if source.id not in row_new["Source"]:
                    row_new["Source"] = row_new["Source"] + [source.id]
            except KeyError:
                row_new = empty.copy()
                row_new["ID"] = last_row_id + t(i + j + 2)
                row_new["Form_ID"] = form_id
                row_new["Alignment"] = new_alignment
                row_new["Source"] = [source.id]
            print(row_new)
            yield row_new

    dataset["CognateTable"].write(
        itertools.chain(
            original_rows,
            new_rows(data_on_form, max_row_id, moved_forms, alignments,
                     source)))
Example #21
    def cmd_makecldf(self, args):
        self.create_schema(args.writer.cldf)

        pk2id = collections.defaultdict(dict)
        sources = parse_string(
            self.raw_dir.joinpath('source.bib').read_text(encoding='utf8'),
            'bibtex')
        self.read('source', pkmap=pk2id)

        refs = []
        for row in self.raw_dir.read_csv('valuesetreference.csv', dicts=True):
            if row['source_pk']:
                refs.append(
                    (row['valueset_pk'], pk2id['source'][row['source_pk']],
                     row['description']))
        srcids = set(r[1] for r in refs)
        args.writer.cldf.add_sources(*[
            Source.from_entry(id_, e) for id_, e in sources.entries.items()
            if id_ in srcids
        ])

        contributors = self.read('contributor',
                                 pkmap=pk2id,
                                 key=lambda r: r['id'])
        for row in contributors.values():
            args.writer.objects['contributors.csv'].append({
                'ID': row['id'],
                'Name': row['name']
            })

        cc = {
            fid: [pk2id['contributor'][r['contributor_pk']] for r in rows]
            for fid, rows in itertools.groupby(
                self.read('contributioncontributor',
                          key=lambda d: (d['contribution_pk'], d['primary'] ==
                                         'f', int(d['ord']))).values(),
                lambda r: r['contribution_pk'])
        }

        areas = self.read('area')
        chapters = self.read('contribution', extended='chapter')

        for row in self.read('parameter',
                             extended='feature',
                             pkmap=pk2id,
                             key=lambda d: fid_key(d['id'])).values():
            args.writer.objects['ParameterTable'].append({
                'ID': row['id'],
                'Name': row['name'],
                'Area': areas[chapters[row['contribution_pk']]['area_pk']]['name'],
                'Chapter': chapters[row['contribution_pk']]['name'],
                'Contributor_ID': cc[row['contribution_pk']],
            })

        for row in self.read(
                'domainelement',
                pkmap=pk2id,
                key=lambda d: (fid_key(d['id'].split('-')[0]), int(d['number']))).values():
            args.writer.objects['CodeTable'].append({
                'ID': row['id'],
                'Parameter_ID': pk2id['parameter'][row['parameter_pk']],
                'Name': row['name'],
                'Description': row['description'],
                'Number': int(row['number']),
                'icon': json.loads(row['jsondata'])['icon'],
            })

        identifier = self.read('identifier')
        lang2id = collections.defaultdict(
            lambda: collections.defaultdict(list))
        for row in self.read('languageidentifier').values():
            id_ = identifier[row['identifier_pk']]
            lang2id[row['language_pk']][id_['type']].append(
                (id_['name'], id_['description']))

        families = self.read('family', pkmap=pk2id)
        genera = self.read('genus', pkmap=pk2id)

        for row in self.read('language', extended='walslanguage',
                             pkmap=pk2id).values():
            id = row['id']
            genus = genera[row['genus_pk']]
            family = families[genus['family_pk']]
            if row['name'] == genus['name'] == family['name']:
                # an isolate!
                genus = family = None
            iso_codes = set(i[0]
                            for i in lang2id[row['pk']].get('iso639-3', []))
            glottocodes = [
                i[0] for i in lang2id[row['pk']].get('glottolog', [])
            ]
            args.writer.objects['LanguageTable'].append({
                'ID': id,
                'Name': row['name'],
                'ISO639P3code': list(iso_codes)[0] if len(iso_codes) == 1 else None,
                'Glottocode': glottocodes[0] if len(glottocodes) == 1 else None,
                'ISO_codes': sorted(iso_codes),
                'Latitude': row['latitude'],
                'Longitude': row['longitude'],
                'Genus': genus['name'] if genus else None,
                'Subfamily': genus['subfamily'] if genus else None,
                'Family': family['name'] if family else None,
                'Samples_100': row['samples_100'] == 't',
                'Samples_200': row['samples_200'] == 't',
            })
        args.writer.objects['LanguageTable'].sort(key=lambda d: d['ID'])

        refs = {
            dpid: [
                str(
                    Reference(
                        source=str(r[1]),
                        desc=r[2].replace('[', ')').replace(']', ')').replace(
                            ';', '.').strip() if r[2] else None))
                for r in refs_
            ]
            for dpid, refs_ in itertools.groupby(refs, lambda r: r[0])
        }

        vsdict = self.read('valueset', pkmap=pk2id)

        examples = self.read('sentence', pkmap=pk2id)
        igts = {}
        for ex in examples.values():
            if all(ex[k] for k in ['description', 'analyzed', 'gloss']):
                a, g = ex['analyzed'].split(), ex['gloss'].split()
                if len(a) != len(g):
                    a, g = [ex['analyzed']], [ex['gloss']]
                igts[ex['pk']] = ex['id']
                args.writer.objects['ExampleTable'].append({
                    'ID': ex['id'],
                    'Language_ID': pk2id['language'][ex['language_pk']],
                    'Primary_Text': ex['name'],
                    'Translated_Text': ex['description'],
                    'Analyzed_Word': a,
                    'Gloss': g,
                })
        example_by_value = {
            vpk: [r['sentence_pk'] for r in rows]
            for vpk, rows in itertools.groupby(
                self.read('valuesentence', key=lambda d: d['value_pk']).values(
                ), lambda d: d['value_pk'])
        }

        for row in self.read('value').values():
            vs = vsdict[row['valueset_pk']]
            comment = None
            ex = [examples[spk] for spk in example_by_value.get(row['pk'], [])]
            if len(ex) == 1 and not any(
                    ex[0][k] for k in ['description', 'analyzed', 'gloss']):
                comment = ex[0]['name']
                del example_by_value[row['pk']]
            args.writer.objects['ValueTable'].append({
                'ID': vs['id'],
                'Language_ID': pk2id['language'][vs['language_pk']],
                'Parameter_ID': pk2id['parameter'][vs['parameter_pk']],
                'Value': pk2id['domainelement'][row['domainelement_pk']].split('-')[1],
                'Code_ID': pk2id['domainelement'][row['domainelement_pk']],
                'Comment': comment,
                'Source': refs.get(vs['pk'], []),
                'Example_ID': sorted(
                    igts[epk] for epk in example_by_value.get(row['pk'], []) if epk in igts),
            })

        args.writer.objects['ValueTable'].sort(
            key=lambda d: (d['Language_ID'], fid_key(d['Parameter_ID'])))

        altnames = []
        for lpk in lang2id:
            for type in lang2id[lpk]:
                if type == 'name':
                    for name, prov in lang2id[lpk][type]:
                        altnames.append((prov, name, pk2id['language'][lpk]))

        lnid = 0
        for (type, name), rows in itertools.groupby(sorted(altnames), lambda t:
                                                    (t[0], t[1])):
            lnid += 1
            args.writer.objects['language_names.csv'].append({
                'ID': str(lnid),
                'Language_ID': [r[2] for r in rows],
                'Name': name,
                'Provider': type,
            })
Example #22
    def cmd_makecldf(self, args):
        self.create_schema(args.writer.cldf)

        pk2id = collections.defaultdict(dict)

        skip_source = [
            'Lous-1969',  # -> Loos-1969
            'Payne-1990',  # -> Payne-1990a
        ]
        updated_source_keys = {
            'Anonymous-nd': 'North-East-Frontier-Agency-1963',
        }
        updated_source_names = {
            'North-East-Frontier-Agency-1963': 'North East Frontier Agency 1963',
        }
        sources = parse_string(
            self.raw_dir.joinpath('source.bib').read_text(encoding='utf8'),
            'bibtex')
        gbs_lg_refs = collections.defaultdict(set)
        src_names = {}
        for s in self.read('source', pkmap=pk2id).values():
            if s['id'] in skip_source:
                continue
            s['id'] = updated_source_keys.get(s['id'], s['id'])
            src_names[s['id']] = updated_source_names.get(s['id'], s['name'])
            try:
                jsd = json.loads(s['jsondata'])
                if 'wals_code' in jsd:
                    [gbs_lg_refs[c].add(s['id']) for c in jsd['wals_code']]
                gbs = jsd['gbs']
                if gbs['id'].strip():
                    sef = sources.entries[s['id']].fields
                    sef['google_book_search_id'] = gbs['id'].strip()
                    sef['google_book_viewability'] = gbs['accessInfo'][
                        'viewability'].strip()
            except (json.decoder.JSONDecodeError, KeyError):
                continue

        chapters = self.read('contribution', extended='chapter', pkmap=pk2id)

        refs = []
        crefs = collections.defaultdict(list)
        for row in self.raw_dir.read_csv('valuesetreference.csv', dicts=True):
            if row['source_pk']:
                sid = pk2id['source'][row['source_pk']]
                if sid not in skip_source:
                    refs.append(
                        (row['valueset_pk'], updated_source_keys.get(sid, sid),
                         row['description']))
        srcids = set(r[1] for r in refs)
        for row in self.raw_dir.read_csv('contributionreference.csv',
                                         dicts=True):
            sid = pk2id['source'][row['source_pk']]
            contribution_id = pk2id['contribution'][row['contribution_pk']]
            if sid not in crefs[contribution_id]:
                crefs[contribution_id].append(sid)
                srcids.add(sid)
        unused_srcids = []
        for id_, e in sources.entries.items():
            if id_ in skip_source:
                continue
            if id_ in srcids:
                if id_ in src_names:
                    e.fields['wals_ref_name'] = src_names[id_]
                args.writer.cldf.add_sources(Source.from_entry(id_, e))
            else:
                unused_srcids.append(id_)
            # add language references from the BibTeX tag 'wals_code'
            # to ensure that nothing was missed in raw/languagesource.csv (37 cases)
            if 'wals_code' in e.fields:
                for c in e.fields['wals_code'].split('; '):
                    gbs_lg_refs[c].add(id_)

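        # Second pass: add the sources that were not cited by any valueset or
        # chapter, so that cited entries come first in the CLDF sources file.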
        for id_, e in sources.entries.items():
            if id_ in skip_source:
                continue
            if id_ in unused_srcids:
                if id_ in src_names:
                    e.fields['wals_ref_name'] = src_names[id_]
                args.writer.cldf.add_sources(Source.from_entry(id_, e))

        editors = {
            e['contributor_pk']: int(e['ord'])
            for e in self.read('editor', key=lambda r: int(r['ord'])).values()
        }

        contributors = self.read('contributor',
                                 pkmap=pk2id,
                                 key=lambda r: r['id'])
        for row in contributors.values():
            args.writer.objects['contributors.csv'].append({
                'ID':
                row['id'],
                'Name':
                row['name'],
                'Url':
                row['url'],
                'Editor_Ord':
                editors[row['pk']] if row['pk'] in editors else 0,
            })

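        # Map each chapter ID to its (primary flag, contributor ID) pairs,
        # primary authors first, ordered by 'ord'.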
        cc = {
            chapters[fid]['id']:
            [(r['primary'], pk2id['contributor'][r['contributor_pk']])
             for r in rows]
            for fid, rows in itertools.groupby(
                self.read('contributioncontributor',
                          key=lambda d: (d['contribution_pk'],
                                         d['primary'] == 'f',
                                         int(d['ord']))).values(),
                lambda r: r['contribution_pk'])
        }

        areas = self.read('area')
        for row in areas.values():
            args.writer.objects['areas.csv'].append({
                'ID':
                row['id'],
                'Name':
                row['name'],
                'dbpedia_url':
                row['dbpedia_url'],
            })

        for row in self.read('parameter',
                             extended='feature',
                             pkmap=pk2id,
                             key=lambda d: fid_key(d['id'])).values():
            args.writer.objects['ParameterTable'].append({
                'ID':
                row['id'],
                'Name':
                row['name'],
                'Chapter_ID':
                chapters[row['contribution_pk']]['id'],
            })

        for row in self.read(
                'domainelement',
                pkmap=pk2id,
                key=lambda d: (fid_key(d['id'].split('-')[0]),
                               int(d['number']))).values():
            args.writer.objects['CodeTable'].append({
                'ID':
                row['id'],
                'Parameter_ID':
                pk2id['parameter'][row['parameter_pk']],
                'Name':
                row['name'],
                'Description':
                row['description'],
                'Number':
                int(row['number']),
                'icon':
                json.loads(row['jsondata'])['icon'],
            })

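        # Group the alternative identifiers (ISO 639-3 codes, Glottocodes,
        # alternative names) of each language by identifier type.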
        identifier = self.read('identifier')
        lang2id = collections.defaultdict(
            lambda: collections.defaultdict(list))
        for row in self.read('languageidentifier').values():
            id_ = identifier[row['identifier_pk']]
            lang2id[row['language_pk']][id_['type']].append(
                (id_['name'], id_['description']))

        families = self.read('family', pkmap=pk2id)
        genera = self.read('genus', pkmap=pk2id)
        countries = self.read('country', pkmap=pk2id)
        lang2country = collections.defaultdict(list)
        for c in self.read('countrylanguage').values():
            lang2country[c['language_pk']].append(
                pk2id['country'][c['country_pk']])
        lrefs = collections.defaultdict(list)
        for c in self.read('languagesource').values():
            sid = pk2id['source'][c['source_pk']]
            sid = updated_source_keys.get(sid, sid)
            if sid not in lrefs[c['language_pk']]:
                lrefs[c['language_pk']].append(sid)

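        # LanguageTable: genus and family are dropped for isolates; ISO code
        # and Glottocode are only filled in when exactly one candidate exists.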
        for row in self.read('language', extended='walslanguage',
                             pkmap=pk2id).values():
            id = row['id']
            genus = genera[row['genus_pk']]
            genus_icon = genus['icon'] if genus else ''
            family = families[genus['family_pk']]
            if row['name'] == genus['name'] == family['name']:
                # an isolate!
                genus = family = None
            iso_codes = set(i[0]
                            for i in lang2id[row['pk']].get('iso639-3', []))
            glottocodes = [
                i[0] for i in lang2id[row['pk']].get('glottolog', [])
            ]
            srcs = lrefs[row['pk']]
            if id in gbs_lg_refs:
                for s in gbs_lg_refs[id]:
                    if s not in srcs:
                        srcs.append(s)
            args.writer.objects['LanguageTable'].append({
                'ID':
                id,
                'Name':
                row['name'].strip(),
                'ISO639P3code':
                list(iso_codes)[0] if len(iso_codes) == 1 else None,
                'Glottocode':
                glottocodes[0] if len(glottocodes) == 1 else None,
                'ISO_codes':
                sorted(iso_codes),
                'Latitude':
                row['latitude'],
                'Longitude':
                row['longitude'],
                'Macroarea':
                row['macroarea'],
                'Genus':
                genus['name'] if genus else None,
                'GenusIcon':
                genus_icon,
                'Subfamily':
                genus['subfamily'] if genus else None,
                'Family':
                family['name'] if family else None,
                'Samples_100':
                row['samples_100'] == 't',
                'Samples_200':
                row['samples_200'] == 't',
                'Country_ID':
                lang2country[row['pk']],
                'Source':
                sorted(srcs),
            })
        args.writer.objects['LanguageTable'].sort(key=lambda d: d['ID'])

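        # Convert the (valueset_pk, source id, description) triples into CLDF
        # reference strings; brackets and semicolons in the page descriptions
        # are replaced since they would clash with the reference syntax. Note
        # that groupby relies on the triples already being grouped by
        # valueset_pk.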
        refs = {
            dpid: [
                str(
                    Reference(
                        source=str(r[1]),
                        desc=r[2].replace('[', ')').replace(']', ')').replace(
                            ';', '.').strip() if r[2] else None))
                for r in refs_
            ]
            for dpid, refs_ in itertools.groupby(refs, lambda r: r[0])
        }

        vsdict = self.read('valueset', pkmap=pk2id)

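        # Only sentences with a translation, analyzed text and gloss become IGT
        # examples; if the analyzed and gloss token counts differ, both are
        # kept as single unsegmented strings.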
        examples = self.read('sentence', pkmap=pk2id)
        igts = {}
        for ex in examples.values():
            if all(ex[k] for k in ['description', 'analyzed', 'gloss']):
                a, g = ex['analyzed'].split(), ex['gloss'].split()
                if len(a) != len(g):
                    a, g = [ex['analyzed']], [ex['gloss']]
                igts[ex['pk']] = ex['id']
                args.writer.objects['ExampleTable'].append({
                    'ID':
                    ex['id'],
                    'Language_ID':
                    pk2id['language'][ex['language_pk']],
                    'Primary_Text':
                    ex['name'],
                    'Translated_Text':
                    ex['description'],
                    'Analyzed_Word':
                    a,
                    'Gloss':
                    g,
                })
        example_by_value = {
            vpk: [r['sentence_pk'] for r in rows]
            for vpk, rows in itertools.groupby(
                self.read('valuesentence', key=lambda d: d['value_pk']).values(
                ), lambda d: d['value_pk'])
        }

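        # Values attached to exactly one non-IGT sentence get that sentence's
        # XHTML (with line breaks removed) as a comment instead of an example
        # link.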
        for row in self.read('value').values():
            vs = vsdict[row['valueset_pk']]
            comment = None
            ex = [examples[spk] for spk in example_by_value.get(row['pk'], [])]
            if len(ex) == 1 and not any(
                    ex[0][k] for k in ['description', 'analyzed', 'gloss']):
                comment = re.sub(r'[\r\n]', '', ex[0]['xhtml'])
                del example_by_value[row['pk']]
            args.writer.objects['ValueTable'].append({
                'ID':
                vs['id'],
                'Language_ID':
                pk2id['language'][vs['language_pk']],
                'Parameter_ID':
                pk2id['parameter'][vs['parameter_pk']],
                'Value':
                pk2id['domainelement'][row['domainelement_pk']].split('-')[1],
                'Code_ID':
                pk2id['domainelement'][row['domainelement_pk']],
                'Comment':
                comment,
                'Source':
                refs.get(vs['pk'], []),
                'Example_ID':
                sorted(igts[epk]
                       for epk in example_by_value.get(row['pk'], [])
                       if epk in igts),
            })

        args.writer.objects['ValueTable'].sort(
            key=lambda d: (d['Language_ID'], fid_key(d['Parameter_ID'])))

        altnames = []
        for lpk in lang2id:
            for type in lang2id[lpk]:
                if type == 'name':
                    for name, prov in lang2id[lpk][type]:
                        altnames.append((prov, name, pk2id['language'][lpk]))

        lnid = 0
        for (type, name), rows in itertools.groupby(
                sorted(altnames), lambda t: (t[0], t[1])):
            lnid += 1
            args.writer.objects['language_names.csv'].append({
                'ID':
                str(lnid),
                'Language_ID': [r[2] for r in rows],
                'Name':
                name.strip(),
                'Provider':
                type,
            })

        for c in sorted(countries.values(), key=lambda x: x['id']):
            args.writer.objects['countries.csv'].append({
                'ID': c['id'],
                'Name': c['name'],
            })

        desc_dir = self.raw_dir / 'descriptions'
        src_pattern = re.compile(
            'src="https?://wals.info/static/descriptions/(?P<sid>s?[0-9]+)/images/(?P<fname>[^"]+)"'
        )

        def repl(m):
            p = desc_dir.joinpath(m.group('sid'), 'images', m.group('fname'))
            if p.exists():
                return 'src="{0}"'.format(data_url(p))
            return m.string[m.start():m.end()]

        descs = {}
        docs_dir = self.cldf_dir / 'docs'
        docs_dir.mkdir(exist_ok=True)
        for d in desc_dir.iterdir():
            if d.is_dir():
                descs[d.stem] = src_pattern.sub(
                    repl,
                    d.joinpath('body.xhtml').read_text(encoding='utf8'))

        for c in sorted(chapters.values(), key=lambda x: int(x['sortkey'])):
            if c['id'] in descs:
                fname = docs_dir / 'chapter_{}.html'.format(c['id'])
                with io.open(fname, 'w', encoding='utf-8') as f:
                    f.write(descs[c['id']])
            cid, wcid = [], []
            if c['id'] in cc:
                cid = [co[1] for co in cc[c['id']] if co[0] == 't']
                wcid = [co[1] for co in cc[c['id']] if co[0] == 'f']
            args.writer.objects['chapters.csv'].append({
                'ID':
                c['id'],
                'Name':
                c['name'],
                'wp_slug':
                c['wp_slug'],
                'Number':
                c['sortkey'],
                'Area_ID':
                areas[c['area_pk']]['id'] if c['area_pk'] in areas else '',
                'Source':
                crefs.get(c['id'], []),
                'Contributor_ID':
                cid,
                'With_Contributor_ID':
                wcid,
            })
Ejemplo n.º 23
0
    def cmd_makecldf(self, args):
        args.writer.add_sources(self.raw_dir.read("Citations.bib"))
        for src in self._read("Citation_codes"):
            if src["type"] == "E":
                args.writer.add_sources(
                    Source("misc",
                           src["ref_abbr"],
                           author=src["original_reference"]))

        glottocodes = {
            language["ID"]: language["Glottocode"]
            for language in self.languages
        }
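        # Prefer the Glottocode from the curated language list; fall back to
        # Glottolog's ISO 639-3 mapping.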
        for language in self._read("Languages"):
            glottocode = glottocodes.get(language["lgid3"])
            if not glottocode:
                glottocode = self.glottolog.glottocode_by_iso.get(
                    language["ISO-639-3"])
            args.writer.add_language(
                ID=language["lgid3"],
                Name=language["language"],
                Glottocode=glottocode,
                Description=language["Description"],
                Subgroup=language["Subgroup"],
                ISO639P3code=language["ISO-639-3"],
            )

        for concept in self.concepts:
            args.writer.add_concept(**concept)

        for (cid, cogid), ll in itertools.groupby(
                sorted(self._read("Data"),
                       key=lambda i: (i["mng_item"], i["cogn_set"])),
                lambda i: (i["mng_item"], i["cogn_set"]),
        ):
            for language in ll:
                kw = dict(
                    Value=language["item"],
                    Language_ID=language["lgid3"],
                    Parameter_ID=cid,
                    Comment=language["general_notes"],
                    Source=[
                        slug(rid, lowercase=False) for rid in split_text(
                            language["ref_abbr"], ",", strip=True)
                    ],
                )
                kw.update({
                    k: language[k]
                    for k in [
                        "item_UPA",
                        "item_IPA",
                        "form_set",
                        "age_term_pq",
                        "age_term_aq",
                        "borr_source",
                        "borr_qual",
                        "etym_notes",
                        "glossing_notes",
                    ]
                })

                for lex in args.writer.add_lexemes(**kw):
                    if cogid != "?":
                        args.writer.add_cognate(lexeme=lex,
                                                Cognateset_ID="{0}-{1}".format(
                                                    cid, cogid))
Ejemplo n.º 24
0
    def cmd_makecldf(self, args):
        args.writer.add_sources(self.raw_dir.read("Citations.bib"))
        bib = parse_string(self.raw_dir.read('Borrowing_references.bib'),
                           'bibtex')
        for k, v in bib.entries.items():
            args.writer.add_sources(
                Source.from_entry(slug(k, lowercase=False), v))

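        # Extend the schema: BorrowingTable gets a constrained Likelihood
        # column and a free-text SourceLanguoid column; FormTable is relaxed so
        # the placeholder values in NULL_ITEMS are allowed without a form.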
        args.writer.cldf.add_component(
            'BorrowingTable', {
                'name': 'Likelihood',
                'dc:description':
                'Likelihood of borrowing (*possible*, *probable* or *clear*).',
                'datatype': {
                    'base': 'string',
                    'format': 'possible|clear|probable'
                }
            }, {
                'name': 'SourceLanguoid',
                'dc:description': 'Borrowing source of lexeme.',
            })
        args.writer.cldf['FormTable', 'form'].required = False
        args.writer.cldf['FormTable', 'value'].null = NULL_ITEMS
        args.writer.cldf['FormTable', 'value'].required = False
        args.writer.cldf['FormTable', 'value'].common_props['dc:description'] = \
            "Lexeme data. Contains a lexeme or '[No equivalent]': no suitable equivalent for a meaning exists), " \
            "'[Form not found]': no suitable equivalent was found, or '[Not reconstructable]': non-recontructable " \
            "meanings in Proto-Uralic."

        for src in self._read("Citation_codes"):
            if src["type"] == "E":
                args.writer.add_sources(
                    Source("misc",
                           src["ref_abbr"],
                           author=src["original_reference"]))

        glottocodes = {
            language["ID"]: language["Glottocode"]
            for language in self.languages
        }
        for language in self._read("Languages"):
            glottocode = glottocodes.get(language["lgid3"])
            if not glottocode:
                glottocode = self.glottolog.glottocode_by_iso.get(
                    language["ISO-639-3"])
            args.writer.add_language(
                ID=language["lgid3"],
                Name=language["language"],
                Glottocode=glottocode,
                Description=language["Description"],
                Subgroup=language["Subgroup"],
                ISO639P3code=language["ISO-639-3"],
            )

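        # Mark each concept's membership in the meaning lists; the columns of
        # Meaning_lists correspond to UralexConcept attributes (with '-'
        # normalized to '_').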
        inlists = {r['mng_item']: r for r in self._read('Meaning_lists')}
        attrs = [
            k for k in attr.fields_dict(UralexConcept).keys() if k != 'LJ_rank'
        ]
        for concept in self.concepts:
            if concept['ID'] in inlists:
                memberships = {
                    k.replace('-', '_'): v == '1'
                    for k, v in inlists[concept['ID']].items()
                    if k.replace('-', '_') in attrs
                }
                concept.update(memberships)
            args.writer.add_concept(**concept)

        for (cid, cogid), ll in itertools.groupby(
                sorted(self._read("Data"),
                       key=lambda i: (i["mng_item"], i["cogn_set"])),
                lambda i: (i["mng_item"], i["cogn_set"]),
        ):
            for language in ll:
                if language['item'] in NULL_ITEMS:
                    language['etym_notes'] += language['item']
                kw = dict(
                    Value=language["item"],
                    Language_ID=language["lgid3"],
                    Parameter_ID=cid,
                    Comment=language["general_notes"],
                    Source=[
                        slug(rid, lowercase=False) for rid in split_text(
                            language["ref_abbr"], ",", strip=True)
                    ],
                )
                kw.update({
                    k: language[k]
                    for k in [
                        "item_UPA",
                        "item_IPA",
                        "form_set",
                        "etym_notes",
                        "glossing_notes",
                    ]
                })

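                # Placeholder values keep their row but their Form is nulled;
                # cognate sets '?' and '0' are skipped; items with a borrowing
                # qualifier also get a BorrowingTable row.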
                for i, lex in enumerate(args.writer.add_lexemes(**kw)):
                    if lex['Form'] in NULL_ITEMS:
                        lex['Form'] = None
                    if cogid not in ["?", "0"]:
                        args.writer.add_cognate(lexeme=lex,
                                                Cognateset_ID="{0}-{1}".format(
                                                    cid, cogid))
                    if language['borr_qual']:
                        c = ': borrowed to Pre-Permic'
                        ref = language['ref_borr']
                        if c in ref:
                            comment = c[1:].strip()
                            ref = ref.replace(c, '')
                        else:
                            comment = None
                        args.writer.objects['BorrowingTable'].append(
                            dict(
                                ID=lex['ID'],
                                Target_Form_ID=lex['ID'],
                                SourceLanguoid=language['borr_source'],
                                Likelihood=language['borr_qual'],
                                Source=bibkeys(ref),
                                Comment=comment,
                            ))
Ejemplo n.º 25
0
    def to_cldf(self,
                concept_map,
                unmapped,
                citekey=None,
                source=None,
                concept_key=None):
        if concept_key is None:
            concept_key = lambda entry: entry.word_id

        if not self.language.glottocode:
            unmapped.languages.add(
                (self.language.id, self.language.name, self.language.iso))

        with CldfDataset((
                'ID',
                'Language_ID',
                'Language_iso',
                'Language_name',
                'Language_local_ID',
                'Parameter_ID',
                'Parameter_name',
                'Parameter_local_ID',
                'Value',
                'Value_in_source',
                'Segments',
                'Context',
                'Source',
                'Cognate_Set',
                'Comment',
                'Loan',
        ),
                         self.dataset,
                         subset=self.language.id) as ds:
            ds.metadata['dc:creator'] = self.language.author
            ds.metadata['dc:identifier'] = self.url('language.php?id=%s' %
                                                    self.language.id)
            if self.language.typedby:
                ds.metadata['dc:contributor'] = self.language.typedby
            if self.language.checkedby:
                ds.metadata['dc:contributor'] = self.language.checkedby
            if self.language.notes:
                ds.metadata['dc:description'] = self.language.notes

            ds.table.schema.aboutUrl = '%s.csv#{ID}' % ds.name
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                self.url('word.php?v=1{Parameter_local_ID}')
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                self.url('language.php?id={Language_local_ID}')

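            # If a dataset-level citekey/source pair is given it is used for
            # all entries; otherwise each entry's own <source> element is
            # slugified into a Source below.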
            ref = None
            if citekey and source:
                ref = citekey
                ds.sources.add(Source('misc', citekey, title=source))

            for entry in self.entries:
                if entry.name == '?':
                    continue
                if not (citekey and source):
                    src = entry.e.find('source')
                    if src is not None and getattr(src, 'text', None):
                        ref = slug(text_type(src.text))
                        ds.sources.add(Source('misc', ref, title=src.text))
                cid = concept_map.get(concept_key(entry))
                if not cid:
                    unmapped.concepts.add((entry.word_id, entry.word))
                for i, (form, context) in enumerate(util.split(entry.name)):
                    ds.add_row([
                        '{0}-{1}'.format(entry.id, i + 1),
                        self.language.glottocode,
                        self.language.iso,
                        self.language.name,
                        self.language.id,
                        cid,
                        entry.word,
                        entry.word_id,
                        util.clean_form(form),
                        form,
                        '',
                        context,
                        ref,
                        entry.cognacy,
                        entry.comment or '',
                        entry.loan == 'L',
                    ])
            segmentize(ds)
        return ds
Ejemplo n.º 26
0
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }

    languages = {}
    words = []

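    # ds.Sheet1.csv mixes metadata and data: rows 3-124 (0-based) describe the
    # 122 languages, rows after 334 contain the word forms.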
    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)

    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None
         for l in dataset.languages})

    sources = {}
    for src, langs in groupby(sorted(languages.values(), key=lambda r: r[6]),
                              lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(*sources.values())
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
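        # D is the input for a LingPy Wordlist: key 0 is the header row,
        # subsequent integer keys hold one word form each.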
        for i, row in enumerate(words):
            form = row[4]
            if not form or form in '*-':
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            gc = glottolog.get(lang, glottolog.get(languages[lang][7]))
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' else (
                'e%s' % i)
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid,
                glottolog.get(lang, glottolog.get(languages[lang][7])), lang,
                languages[lang][7], concepticon[row[1]], row[1], form,
                segments, sources[lang].id, None
            ])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]
        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))

    unmapped.pprint()
Ejemplo n.º 27
0
def test_Source_persons():
    assert len(list(Source.persons('A. Meier'))) == 1
    assert len(list(Source.persons('Meier, A.B.'))) == 1
    assert len(list(Source.persons('A. Meier, B. Meier, C.Meier'))) == 3
Ejemplo n.º 28
0
def test_Source_persons():
    assert len(list(Source.persons('A. Meier'))) == 1
    assert len(list(Source.persons('Meier, A.B.'))) == 1
    assert len(list(Source.persons('A. Meier, B. Meier, C.Meier'))) == 3
Ejemplo n.º 29
0
def test_Sources_with_None_values(tmpdir):
    src = Sources()
    src.add(Source('book', 'huber2005', title=None))
    bib = tmpdir / 'test.bib'
    src.write(str(bib))
Ejemplo n.º 30
0
def test_Source_from_bibtex():
    bibtex = '@' + BIB.split('@')[1]
    assert Source.from_bibtex(bibtex).entry.fields['title'] == 'Obrazy z Rus'
Ejemplo n.º 31
0
    def test_Source_persons(self):
        from pycldf.sources import Source

        self.assertEqual(len(list(Source.persons('A. Meier'))), 1)
        self.assertEqual(len(list(Source.persons('Meier, A.B.'))), 1)
        self.assertEqual(len(list(Source.persons('A. Meier, B. Meier, C.Meier'))), 3)
Ejemplo n.º 32
0
def test_Source_from_bibtex():
    bibtex = '@' + BIB.split('@')[1]
    assert Source.from_bibtex(bibtex).entry.fields['title'] == 'Obrazy z Rus'
Ejemplo n.º 33
0
    def cmd_makecldf(self, args):
        self.create_schema(args.writer.cldf)

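        # Convert the rows of the "references" table into BibTeX sources and
        # record which references belong to which language (rmap).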
        rmap = collections.defaultdict(list)
        for row in self.query('select * from "references"'):
            rmap[row['language_id']].append(row['id'])
            kw = {}
            for key, bkey in {
                    'authors': 'author',
                    'year': 'year',
                    'article_title': 'title',
                    'editors': 'editor',
                    'book_title': 'booktitle',
                    'city': 'address',
                    'issue': 'issue',
                    'journal': 'journal',
                    'pages': 'pages',
                    'publisher': 'publisher',
                    'series_title': 'series',
                    'url': 'url',
                    'volume': 'volume',
                    'additional_information': 'note',
            }.items():
                if row[key]:
                    kw[bkey] = row[key]
            args.writer.add_sources(
                Source(row['bibtex_type'] or 'misc', str(row['id']), **kw))
        all_sources = {rid for lang_refs in rmap.values() for rid in lang_refs}

        contributors = {
            r['id']: r['name']
            for r in self.raw_dir.read_csv('contributors.csv', dicts=True)
        }
        contributions = {}
        for lid, rows in itertools.groupby(
                sorted(self.raw_dir.read_csv('contributions.csv', dicts=True),
                       key=lambda d: (d['language_id'],
                                      int(d['sort_order_number'] or 0))),
                lambda d: d['language_id'],
        ):
            contributions[int(lid)] = ' and '.join([
                contributors[d['person_id']] for d in rows
                if d['person_id'] in contributors
            ])

        lang2gl = {l['Name']: l['Glottocode'] for l in self.languages}
        lmap = {}
        for lang in self.query('select * from languages order by name'):
            lang['glottolog_code'] = lang2gl.get(lang['name'],
                                                 lang['glottolog_code'])
            lmap[lang['id']] = lang['glottolog_code']
            name = lang['name']
            if lang['variety']:
                name += ' ({0})'.format(lang['variety'])
            args.writer.add_language(
                ID=lang['glottolog_code'],
                Name=name,
                Glottocode=lang['glottolog_code'],
                ISO639P3code=lang['iso_code'],
                Latitude=lang['latitude'],
                Longitude=lang['longitude'],
                Comment=lang['comments'],
                contributors=contributions[lang['id']],
                continent=lang['continent'],
            )

        args.writer.add_concepts(
            id_factory=lambda c: c.attributes['label_for_url'].replace('/', '_'))
        cmap = {}
        for meaning in self.query("select * from meanings order by label"):
            cmap[meaning['id']] = meaning['label_for_url'].replace('/', '_')
            args.writer.add_concept(
                ID=meaning['label_for_url'].replace('/', '_'),
                Name=meaning['label'],
                typical_context=meaning['typical_context'],
            )

        fmap = {}
        forms = """\
select
    v.id, 
    v.verb_form,
    v.original_script,
    v.comment,
    v.simplex_or_complex,
    v.verb_type,
    v.language_id,
    v.coding_frame_id,
    mv.meaning_id
from 
    verbs as v, meanings_verbs as mv 
where
    v.id = mv.verb_id
order by v.language_id, mv.meaning_id;"""
        for row in self.query(forms):
            #
            # FIXME: flesh out data!
            #
            res = args.writer.add_forms_from_value(
                Value=row['verb_form'],
                Language_ID=lmap[row['language_id']],
                Parameter_ID=cmap[row['meaning_id']],
                Comment=row['comment'],
                Source=rmap[row['language_id']],
                verb_type=row['verb_type'],
                original_script=row['original_script'],
                simplex_or_complex=row['simplex_or_complex'],
                Basic_Coding_Frame_ID=row['coding_frame_id'],
                #
                # FIXME: mark singular (SG) and plural (PL)?
                #
            )
            fmap[row['id']] = [r['ID'] for r in res]

        #CREATE TABLE "examples" (
        # "id" integer(8),
        # "reference_id" integer(8),
        # "person_id" integer(8),
        # "original_orthography" text,
        # "media_file_name" varchar(255),
        # "media_file_timecode" varchar(255),
        # "reference_pages" varchar(255),
        # "number" integer);
        gloss_fix = {
            'find.out-NTR<STV>-OBJ-3.ERG DET= boy':
            'find.out-NTR<STV>-OBJ-3.ERG DET=boy',
            'scream<STV> DET= PL-child':
            'scream<STV> DET=PL-child',
            'boy.ERG brother.OBL-INSTR girl.OBL-LAT embrace(IV).ABS IV-fill-CAUS- PST':
            'boy.ERG brother.OBL-INSTR girl.OBL-LAT embrace(IV).ABS IV-fill-CAUS-PST',
        }
        morphemes_fix = {
            r'tande-mne brbr-äm y/ram\te': 'tande-mne brbr-äm y/ram_te',
        }
        ex2verb = {}
        for row in self.query("""\
select e.id, group_concat(ev.verb_id, ' ') as vids
from examples as e, examples_verbs as ev 
where e.id = ev.example_id group by e.id"""):
            ex2verb[row['id']] = []
            for vid in set(row['vids'].split()):
                ex2verb[row['id']].extend(fmap[int(vid)])

        def example_id(row):
            return '{0}-{1}'.format(lmap[row['language_id']], row['number'])

        def get_source(row):
            source = row.get('reference_id')
            if source and source in all_sources:
                ref_pages = row.get('reference_pages') or ''
                # occasionally, `reference_pages` seems to contain stuff other
                # than page numbers -- try to ignore that..
                if re.fullmatch(r'\s*\d+([-–]+\d+)?\s*', ref_pages):
                    return ['{}[{}]'.format(source, ref_pages)]
                else:
                    return [str(source)]
            else:
                return None

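        # Examples are numbered per language; the query orders by number
        # descending, so the per-language maximum is seen first and rows with
        # number 0 get the next free number.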
        maxnum = {}
        for row in self.query(
                "select * from examples order by language_id, number desc"):
            if row['language_id'] not in maxnum:
                maxnum[row['language_id']] = row['number']
            if row['number'] == 0:
                maxnum[row['language_id']] += 1
                row['number'] = maxnum[row['language_id']]
            row['gloss'] = gloss_fix.get(row['gloss'], row['gloss'])
            row['analyzed_text'] = morphemes_fix.get(row['analyzed_text'],
                                                     row['analyzed_text'])
            args.writer.objects['ExampleTable'].append(
                dict(
                    ID=example_id(row),
                    Language_ID=lmap[row['language_id']],
                    Primary_Text=row['primary_text'],
                    Analyzed_Word=row['analyzed_text'].split(),
                    Gloss=row['gloss'].split(),
                    Translated_Text=row['translation'],
                    Comment=row['comment'],
                    Original_Orthography=row['original_orthography'],
                    Translation_Other=row['translation_other'],
                    Number=row['number'],
                    Example_Type=row['example_type'],
                    Source=get_source(row),
                    Form_IDs=sorted(ex2verb.get(row['id'], [])),
                ))

        for row in self.query(
                "select * from microroles order by meaning_id, id"):
            args.writer.objects['microroles.csv'].append(
                dict(
                    ID=row['id'],
                    Name=row['name'],
                    Parameter_ID=cmap[row['meaning_id']],
                    Role_Letter=row['role_letter'],
                    Original_Or_New=row['original_or_new'],
                    Name_For_URL=row['name_for_url'],
                ))

        for row in self.query(
                "select * from coding_sets order by language_id, id"):
            lang_id = lmap.get(row['language_id'])
            if lang_id:
                args.writer.objects['coding-sets.csv'].append(
                    dict(
                        ID=row['id'],
                        Name=row['name'],
                        Comment=row['comment'],
                        Language_ID=lang_id,
                    ))

        for row in self.query(
                'select * from coding_frames order by language_id, id'):
            args.writer.objects['coding-frames.csv'].append(
                dict(
                    ID=row['id'],
                    Language_ID=lmap[row['language_id']],
                    Coding_Frame_Schema=row['coding_frame_schema'],
                    Description=row['description'],
                    Comment=row['comment'],
                    Derived=row['derived'],
                ))

        cf_roles = collections.defaultdict(list)
        for row in self.query(
                'SELECT coding_frame_index_number_id AS index_id, microrole_id'
                '  FROM coding_frame_index_numbers_microroles'):
            cf_roles[row['index_id']].append(str(row['microrole_id']))

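        # Re-key coding frame index numbers as
        # '<coding_frame_id>-<index_number>'; imap keeps the mapping from the
        # old database ids.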
        imap = {}
        coding_frames = {
            row['ID']
            for row in args.writer.objects['coding-frames.csv']
        }
        for row in self.query(
                'SELECT i.id, coding_frame_id, index_number, coding_set_id, argument_type'
                '  FROM coding_frame_index_numbers AS i'
                '  JOIN argument_types AS a'
                '    ON a.id = i.argument_type_id'
                '  ORDER BY coding_frame_id, index_number'):
            if row['coding_frame_id'] not in coding_frames:
                continue
            roles = cf_roles.get(row['id'])
            new_id = '{}-{}'.format(row['coding_frame_id'],
                                    row['index_number'])
            imap[row['id']] = new_id
            args.writer.objects['coding-frame-index-numbers.csv'].append(
                dict(
                    ID=new_id,
                    Coding_Frame_ID=row['coding_frame_id'],
                    Index_Number=row['index_number'],
                    Coding_Set_ID=row['coding_set_id'],
                    Argument_Type=row['argument_type'],
                    Microrole_IDs=roles,
                ))

        form_cf_roles = collections.OrderedDict()
        for row in self.query('SELECT verb_id, coding_frame_id, microrole_id'
                              '  FROM verb_coding_frame_microroles'):
            pair = (row['verb_id'], row['coding_frame_id'])
            if pair not in form_cf_roles:
                form_cf_roles[pair] = []
            form_cf_roles[pair].append(row['microrole_id'])

        args.writer.objects['form-coding-frame-microroles.csv'] = [{
            'ID':
            '{}-{}'.format(fmap[verb_id][0], cf_id),
            'Form_ID':
            fmap[verb_id][0],
            'Coding_Frame_ID':
            str(cf_id),
            'Microrole_IDs':
            list(map(str, role_ids)),
        } for (verb_id,
               cf_id), role_ids in form_cf_roles.items() if verb_id in fmap]

        cf_examples = collections.OrderedDict()
        for row in self.query(
                'select verb_id,coding_frame_id,language_id,number'
                '  from coding_frame_examples'
                '  join examples on example_id = examples.id'):
            verb_id = fmap.get(row['verb_id'])
            if verb_id:
                key = (verb_id[0], row['coding_frame_id'])
                if key not in cf_examples:
                    cf_examples[key] = []
                cf_examples[key].append(example_id(row))

        args.writer.objects['coding-frame-examples.csv'] = [{
            'ID':
            str(index + 1),
            'Form_ID':
            verb_id,
            'Coding_Frame_ID':
            cf_id,
            'Example_IDs':
            ex_ids,
        } for (index, ((verb_id, cf_id),
                       ex_ids)) in enumerate(cf_examples.items())]

        for row in self.query(
                "select * from alternations order by language_id, id"):
            args.writer.objects['alternations.csv'].append(
                dict(
                    ID=row['id'],
                    Name=re.sub(r'</?span[^>]*>', '', row['name'], flags=re.I),
                    Description=row['description'],
                    Language_ID=lmap[row['language_id']],
                    Alternation_Type=row['alternation_type'],
                    Coding_Frames_Text=row['coding_frames_text'],
                    Complexity=row['complexity'],
                ))

        av_examples = collections.defaultdict(list)
        for row in self.query(
                'select alternation_value_id, language_id, number'
                '  from alternation_values_examples as ve'
                '  join examples on ve.example_id = examples.id'
                '  order by alternation_value_id, language_id, number'):
            av_examples[row['alternation_value_id']].append(example_id(row))

        for row in self.query(
                'select * from alternation_values order by verb_id, alternation_id, id'
        ):
            vid = int(row['verb_id'])
            if vid in fmap and fmap[vid]:
                args.writer.objects['alternation-values.csv'].append(
                    dict(
                        ID=row['id'],
                        Form_ID=fmap[vid][0],
                        Alternation_ID=row['alternation_id'],
                        Alternation_Occurs=row['alternation_occurs'],
                        Comment=row['comment'],
                        Derived_Code_Frame_ID=row['derived_coding_frame_id'],
                        Example_IDs=av_examples.get(row['id']),
                    ))