Example 1
def reshape_dataset(dataset: pycldf.Wordlist,
                    add_column: bool = True) -> pycldf.Dataset:
    # check for existing cognateset table
    if dataset.column_names.cognatesets is None:
        # Create a Cognateset Table
        dataset.add_component("CognatesetTable")

    # add a concept column to the cognateset table
    if add_column:
        if dataset.column_names.cognatesets.parameterReference is None:
            dataset.add_columns("CognatesetTable", "Core_Concept_ID")
            c = dataset["CognatesetTable"].tableSchema.columns[-1]
            c.datatype = dataset["ParameterTable", "ID"].datatype
            c.propertyUrl = URITemplate(
                "http://cldf.clld.org/v1.0/terms.rdf#parameterReference")
            fname = dataset.write_metadata()
            # Reload dataset with new column definitions
            dataset = pycldf.Wordlist.from_metadata(fname)
    return dataset
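The function above assumes pycldf and a URITemplate class (e.g. csvw.metadata.URITemplate) are imported elsewhere in the module. A minimal usage sketch, with a hypothetical metadata path:

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # hypothetical path
# Returns the reloaded dataset, whose CognatesetTable now carries a Core_Concept_ID column.
dataset = reshape_dataset(dataset, add_column=True)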
Example 2
def add_concepticon_names(
    dataset: pycldf.Wordlist,
    column_name: str = "Concepticon_Gloss",
):
    # Add the gloss column to the ParameterTable (skip if it already exists)
    try:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
    except ValueError:
        pass

    write_back = []
    for row in cli.tq(
            dataset["ParameterTable"],
            task="Write concepts with concepticon names to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[row[
                dataset.column_names.parameters.concepticonReference]].gloss
        except KeyError:
            pass

        write_back.append(row)

    dataset.write(ParameterTable=write_back)
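add_concepticon_names relies on module-level cli and concepticon objects (a progress-bar helper and a pyconcepticon API wrapper) that are assumed to be imported elsewhere. A minimal usage sketch, with a hypothetical metadata path:

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # hypothetical path
# Fills the Concepticon_Gloss column of the on-disk ParameterTable.
add_concepticon_names(dataset, column_name="Concepticon_Gloss")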
Example 3
    def cmd_makecldf(self, args):
        dsdir = self.dir / 'raw' / 'Verkerk-DravLex-622ac6e'
        dataset = Wordlist.from_metadata(dsdir / 'Wordlist-metadata.json')

        # load concepts
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
            lookup_factory="Name"
        )

        # load sources from original CLDF, and then the fieldwork source
        args.writer.add_sources(*self.raw_dir.read_bib(dsdir / 'sources.bib'))
        args.writer.add_sources()
        
        # load languages
        args.writer.add_languages()

        # load cognates
        cogs = {
            r['Form_ID']: r for r in self.raw_dir.read_csv(dsdir / 'cognates.csv', dicts=True)
        }
        
        # load data
        for row in self.raw_dir.read_csv(dsdir / 'forms.csv', dicts=True):
            src = row['Source'].split(";") if row['Source'] else ['KolipakamFW']
            cog = cogs.get(row['ID'])
            for lex in args.writer.add_forms_from_value(
                Local_ID=row['ID'],
                Language_ID=row['Language_ID'],
                Parameter_ID=concepts[row['Parameter_ID']],
                Value=row['Form'],
                Source=src,
                Comment=row['status'],
                Loan=True if row['status'] else False
            ):
                args.writer.add_cognate(
                    lexeme=lex,
                    ID=cog['ID'],
                    Source=cog['Source'],
                    Cognateset_ID=cog['Cognateset_ID'],
                    Comment=", ".join([cog['Comment'], cog['source_comment']])
                )
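cmd_makecldf above is a method of a lexibank dataset class; a minimal sketch of the enclosing class, assuming the standard pylexibank layout (the dataset id shown is hypothetical):

import pathlib
from pylexibank import Dataset as BaseDataset


class Dataset(BaseDataset):
    dir = pathlib.Path(__file__).parent
    id = "dravlex"  # hypothetical dataset id

    def cmd_makecldf(self, args):
        ...  # the method shown above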
Example 4
def make_cldf(db, out, mid):
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = Wordlist.in_dir(out)

    # Source words are not coded for meaning slots, so we have to relax the schema:
    ds['FormTable', 'Parameter_ID'].required = False

    # We add the WOLD language metadata:
    ds.add_component('LanguageTable')

    # some metadata about the comparison meanings:
    ds.add_component('ParameterTable', 'Category', 'SemanticField_ID', 'SemanticField')

    # and the information on borrowings (aka loanwords):
    ds.add_component(
        'BorrowingTable',
        {
            'name': 'Relation',
            'datatype': {'base': 'string', 'format': 'immediate|earlier'}},
        {'name': 'Certain', 'datatype': 'boolean'})

    # Now we collect the data by querying the database:
    forms, languages = [], {}

    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    pids = set()  # store all meaning IDs occurring for any form
    upk2uid = {}  # store the mapping of word pks to Form_ID, for relating loans
    for row in db.execute(SQL_FORMS.format(mid)):
        lpk, lid, lname, vspk, vid, pid, uname, upk = row
        upk2uid[upk] = vid
        ids = lids.get(lpk, {})
        pids.add(pid)
        languages[lpk] = dict(
            ID=lid,
            Name=lname,
            Glottocode=ids.get('glottolog'),
            ISO639P3code=ids.get('iso639-3'),
        )
        forms.append(dict(
            ID=vid,
            Language_ID=lid,
            Parameter_ID=pid,
            Form=uname,
        ))

    borrowings = []
    sourceforms = {}
    for i, row in enumerate(db.execute(SQL_SOURCE_FORMS.format(mid))):
        lpk, lid, lname, form, uid, tpk, lrel, lcertain = row
        ids = lids.get(lpk, {})
        if form != 'Unidentifiable':
            borrowings.append(dict(
                ID='{0}'.format(i + 1),
                Source_Form_ID=uid,
                Target_Form_ID=upk2uid[tpk],
                Relation=lrel,
                Certain=lcertain,
            ))
            sourceforms[uid] = dict(
                ID=uid,
                Language_ID=lid,
                Parameter_ID=None,
                Form=form,
            )
            languages[lpk] = dict(
                ID=lid,
                Name=lname,
                Glottocode=ids.get('glottolog'),
                ISO639P3code=ids.get('iso639-3'),
            )

    meanings = []
    for row in db.execute(SQL_MEANING):
        id, name, semantic_category, sfid, sfname = row
        if id in pids:
            meanings.append(dict(
                ID=id,
                Name=name,
                Category=semantic_category,
                SemanticField_ID=sfid,
                SemanticField=sfname,
            ))

    ds.write(
        FormTable=forms + list(sourceforms.values()),
        ParameterTable=meanings,
        LanguageTable=languages.values(),
        BorrowingTable=borrowings,
    )
    ds.validate()
Example 5
from xml.etree.ElementTree import fromstring
from xmljson import badgerfish as bf
import sys
import os
import csv
from cariban import util
from pycldf import Wordlist
import re
import pyperclip
lexicon = {}
cariban_data = Wordlist.from_metadata("../cariban_data.json")
for row in cariban_data["FormTable"]:
    alt_glossings = row["Glossing"].split("; ")
    if len(alt_glossings) == 0 or alt_glossings[0] == "":
        meanings = row["Parameter_ID"]
    else:
        meanings = alt_glossings
    lexicon[row["ID"]] = {
        "forms": row["Form"],
        "meanings": meanings,
        "language": row["Language_ID"],
    }
# print(lexicon)


def search_lexicon(form, meaning, language):
    if len(lexicon) == 0:
        return ("X")
    if not meaning.isupper():
        new_meaning = meaning.replace(".", " ")
    else:
Example 6
def add_concepticon_references(
    dataset: pycldf.Wordlist,
    gloss_languages: t.Mapping[str, str],
    status_update: t.Optional[str],
    overwrite: bool = False,
) -> None:
    """Guess Concepticon links for a multilingual Concept table.

    Fill the concepticonReference column of the dataset's ParameterTable with
    best guesses for Concepticon IDs, based on gloss columns in different
    languages.

    Parameters
    ==========
    dataset: A pycldf.Wordlist with a concepticonReference column in its
        ParameterTable
    gloss_languages: A mapping from ParameterTable column names to ISO-639-1
        language codes that Concepticon has concept lists for (e.g. en, fr, de,
        es, zh, pt)
    status_update: String written to Status_Column of #parameterTable if provided
    overwrite: Overwrite existing Concepticon references

    """
    # TODO: If this function took only dataset["ParameterTable"] and the name
    # of the target column in there as arguments, one could construct examples
    # that just use the Iterable API and therefore look nice as doctests.
    gloss_lists: t.Dict[str, t.List[str]] = {column: [] for column in gloss_languages}

    for row in dataset["ParameterTable"]:
        for column, glosses in gloss_lists.items():
            glosses.append(row[column] or "?")  # Concepticon abhors empty glosses.

    targets = {
        language: concepticon.api._get_map_for_language(language, None)
        for language in gloss_languages.values()
    }

    cmaps: t.List[t.Dict[int, t.Tuple[t.List[int], int]]] = [
        (
            concept_map2(
                glosses,
                [i[1] for i in targets[gloss_languages[column]]],
                similarity_level=2,
                language=gloss_languages[column],
            ),
            # What a horrendous API! Why can't it return glosses or IDs instead
            # of, as it does now, target-indices so I have to schlepp target along
            # with the results?
            targets[gloss_languages[column]],
        )
        for column, glosses in gloss_lists.items()
    ]

    write_back = []
    for i, row in enumerate(dataset["ParameterTable"]):
        if overwrite or not row.get(
            dataset.column_names.parameters.concepticonReference
        ):
            matches = [(m.get(i, ([], 10)), t) for m, t in cmaps]
            best_sim = min(x[0][1] for x in matches)
            best_matches = [t[m] for (ms, s), t in matches for m in ms if s <= best_sim]
            c: t.Counter[str] = collections.Counter(id for id, string in best_matches)
            if len(c) > 1:
                print(row, best_sim, c.most_common())
                row[
                    dataset.column_names.parameters.concepticonReference
                ] = c.most_common(1)[0][0]
            elif len(c) < 1:
                print(row)
            else:
                row[
                    dataset.column_names.parameters.concepticonReference
                ] = c.most_common(1)[0][0]
        # add status update if given
        if status_update:
            row["Status_Column"] = status_update
        write_back.append(row)

    dataset.write(ParameterTable=write_back)
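A possible invocation, assuming the ParameterTable has an English gloss in "Name" and a Spanish gloss column (both column names below are hypothetical):

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # hypothetical path
add_concepticon_references(
    dataset,
    gloss_languages={"Name": "en", "Spanish_Gloss": "es"},  # hypothetical gloss columns
    status_update="automatic Concepticon link",
    overwrite=False,
)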
Example 7
from clldutils.path import Path
from clldutils.misc import slug
from pycldf import Wordlist
from clld_phylogeny_plugin.models import Phylogeny, TreeLabel, LanguageTreeLabel
from clld_cognacy_plugin.models import Cognate, Cognateset
from csvw.dsv import reader


import cobl2
from cobl2 import models
import clld_cognacy_plugin.models


data_file_path = Path(cobl2.__file__).parent / '../..' / 'iecor'

ds = Wordlist.from_metadata(data_file_path / 'cldf' / 'cldf-metadata.json')

photos = {
    p.stem: p.as_posix() for p in
    (Path(cobl2.__file__).parent / '../..' / 'CoBL-public' / 'cobl' / 'static' / 'contributors').iterdir()
    if p.suffix == '.jpg'}
for k, v in {
    'Kümmel': 'Kuemmel',
    'de Vaan': 'deVaan',
    'Dewey-Findell': 'Dewey',
}.items():
    photos[k] = photos[v]


def main(args):
    data = Data()
Example 8
    def original_cldf(self):
        for p in self.raw_dir.iterdir():
            if p.name.endswith(MD_SUFFIX):
                return Wordlist.from_metadata(p)
Example 9
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in [
            "segments", "segmentSlice", "cognatesetReference", "alignment"
    ]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass
    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(forms.items(),
                          task="Extracting cognate judgements from forms…"):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id("{:}-{:}".format(
                    form["concept"], form["cognatesetReference"]))
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if ("+" in form["segments"]
                            and dataset["FormTable",
                                        "cognatesetReference"].separator):
                        logger.warning(
                            "You seem to have morpheme annotations in your cognates. I will probably mess them up a bit, because I have not been taught properly how to deal with them. Sorry!"
                        )
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    # Warn individually only for the first few forms without segments;
                    # a summary warning is emitted below once there are five or more.
                    if forms_without_segments < 5:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using `lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]
    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)

    dataset.write(CognateTable=cognate_judgements)
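A minimal usage sketch for add_cognate_table, with a hypothetical metadata path; it moves per-form cognateset annotations into an explicit CognateTable and rewrites both tables on disk:

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # hypothetical path
# split=True derives one cognate set per (concept, cognate class) pair.
add_cognate_table(dataset, split=True)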
Example 10
def to_cldf(wordlist,
            path='cldf',
            source_path=None,
            ref="cogid",
            segments="tokens",
            form="ipa",
            note='note',
            form_in_source="value",
            source=None,
            alignment=None):
    """Convert a wordlist in LingPy to CLDF.
    
    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTex file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default='note')
        The column in which you store your comments.
    form_in_source : str (default="value")
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(ID=lid,
                                  Name=wordlist[idx, 'doculect'],
                                  Glottocode=wordlist[idx, 'glottocode'])

        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(ID=pid,
                                   Name=wordlist[idx, 'concept'],
                                   Concepticon_ID=wordlist[idx,
                                                           'concepticon_id'])

        forms.append(
            dict(ID=str(idx),
                 Language_ID=lid,
                 Parameter_ID=pid,
                 form_in_source=wordlist[idx, form_in_source] or ''
                 if form_in_source else '',
                 Form=wordlist[idx, form] or '' if form else '',
                 Segments=wordlist[idx, segments] or '' if segments else '',
                 Source=[wordlist[idx, source]] or [] if source else [],
                 Comment=wordlist[idx, note] or '' if note else ''))

        if ref:
            cognates.append(
                dict(ID=str(idx),
                     Form_ID=str(idx),
                     Cognateset_ID=wordlist[idx, ref],
                     Alignment=wordlist[idx, alignment] or ['']
                     if alignment else ['']))

    ds.write(FormTable=forms,
             LanguageTable=languages.values(),
             ParameterTable=parameters.values(),
             CognateTable=cognates)
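A hedged usage sketch for to_cldf, assuming a LingPy wordlist file with doculect, concept, ipa, tokens, glottocode and cogid columns (the file name is hypothetical):

from lingpy import Wordlist

wl = Wordlist("wordlist.tsv")  # hypothetical input file
to_cldf(wl, path="cldf", ref="cogid", segments="tokens", form="ipa")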
Example 11
def main(args):  # pragma: no cover
    wl = Wordlist.from_metadata(args.data_file('cldf', 'cldf-metadata.json'))

    data = Data()
    data.add(
        common.Contributor, 'barthwolfgang',
        id='barthwolfgang',
        name="Wolfgang Barth",
        url="http://www.dynamicsoflanguage.edu.au/")
    #
    # FIXME: get dataset attributes from CLDF metadata!
    #
    dataset = common.Dataset(
        id='parabank',
        name='Parabank Pronouns',
        description='Database of pronouns',
        domain='parabank.clld.org',
        publisher_name="CoEDL Centre of Excellence for the Dynamics of Language",
        publisher_place="Canberra, Australia",
        publisher_url="http://www.dynamicsoflanguage.edu.au/",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0'})
    DBSession.add(dataset)

    for i, editor in enumerate(['barthwolfgang']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for l in wl['LanguageTable']:
        lang = data.add(
            models.ParabankLanguage,
            l['ID'],
            id=l['ID'],
            name=l['Name'],
            description=l['Notes'],
            source=l['Source_Citation'],
            classification=l['Classification'],
        )
        add_language_codes(data, lang, None, glottocode=l['Glottocode'])

    for p in wl['ParameterTable']:
        data.add(
            common.Parameter,
            p['ID'],
            id=p['ID'],
            name='{0} ({1})'.format(p['Name'], p['ID']),
            #description=p['Description'],
        )

    for f in wl['FormTable']:
        vsid = '{0}-{1}'.format(f['Parameter_ID'], f['Language_ID'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id=vsid,
                language=data['ParabankLanguage'][f['Language_ID']],
                parameter=data['Parameter'][f['Parameter_ID']],
                contribution=contrib)

        DBSession.add(models.Word(
            id=f['ID'],
            name=f['Form'],
            comment=f.get('Comment'),
            original=f['Original_parameter'],
            valueset=vs))

    load_families(
        data,
        [(l.glottocode, l) for l in data['ParabankLanguage'].values()],
        glottolog_repos=args.data_file('glottolog'),
        isolates_icon='tcccccc')
Example 12
def make_cldf(db, out, mid):
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = Wordlist.in_dir(out)

    # Source words are not coded for meaning slots, so we have to relax the schema:
    ds['FormTable', 'Parameter_ID'].required = False

    # We add the WOLD language metadata:
    ds.add_component('LanguageTable')

    # some metadata about the comparison meanings:
    ds.add_component('ParameterTable', 'Category', 'SemanticField_ID',
                     'SemanticField')

    # and the information on borrowings (aka loanwords):
    ds.add_component(
        'BorrowingTable', {
            'name': 'Relation',
            'datatype': {
                'base': 'string',
                'format': 'immediate|earlier'
            }
        }, {
            'name': 'Certain',
            'datatype': 'boolean'
        })

    # Now we collect the data by querying the database:
    forms, languages = [], {}

    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    pids = set()  # store all meaning IDs occurring for any form
    upk2uid = {}  # store the mapping of word pks to Form_ID, for relating loans
    for row in db.execute(SQL_FORMS.format(mid)):
        lpk, lid, lname, vspk, vid, pid, uname, upk = row
        upk2uid[upk] = vid
        ids = lids.get(lpk, {})
        pids.add(pid)
        languages[lpk] = dict(
            ID=lid,
            Name=lname,
            Glottocode=ids.get('glottolog'),
            ISO639P3code=ids.get('iso639-3'),
        )
        forms.append(
            dict(
                ID=vid,
                Language_ID=lid,
                Parameter_ID=pid,
                Form=uname,
            ))

    borrowings = []
    sourceforms = {}
    for i, row in enumerate(db.execute(SQL_SOURCE_FORMS.format(mid))):
        lpk, lid, lname, form, uid, tpk, lrel, lcertain = row
        ids = lids.get(lpk, {})
        if form != 'Unidentifiable':
            borrowings.append(
                dict(
                    ID='{0}'.format(i + 1),
                    Source_Form_ID=uid,
                    Target_Form_ID=upk2uid[tpk],
                    Relation=lrel,
                    Certain=lcertain,
                ))
            sourceforms[uid] = dict(
                ID=uid,
                Language_ID=lid,
                Parameter_ID=None,
                Form=form,
            )
            languages[lpk] = dict(
                ID=lid,
                Name=lname,
                Glottocode=ids.get('glottolog'),
                ISO639P3code=ids.get('iso639-3'),
            )

    meanings = []
    for row in db.execute(SQL_MEANING):
        id, name, semantic_category, sfid, sfname = row
        if id in pids:
            meanings.append(
                dict(
                    ID=id,
                    Name=name,
                    Category=semantic_category,
                    SemanticField_ID=sfid,
                    SemanticField=sfname,
                ))

    ds.write(
        FormTable=forms + list(sourceforms.values()),
        ParameterTable=meanings,
        LanguageTable=languages.values(),
        BorrowingTable=borrowings,
    )
    ds.validate()
Example 13
def to_cldf(wordlist, path='cldf', source_path=None, ref="cogid",
        segments="tokens", form="ipa", note='note', form_in_source="value",
        source=None, alignment=None):
    """Convert a wordlist in LingPy to CLDF.
    
    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTex file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default='note')
        The column in which you store your comments.
    form_in_source : str (default="value")
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(
            read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(
                    ID=lid,
                    Name=wordlist[idx, 'doculect'],
                    Glottocode=wordlist[idx, 'glottocode'])

        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(
                ID=pid,
                Name=wordlist[idx, 'concept'],
                Concepticon_ID=wordlist[idx, 'concepticon_id'])

        forms.append(dict(
            ID=str(idx),
            Language_ID=lid,
            Parameter_ID=pid,
            form_in_source=wordlist[idx, form_in_source] or '' if form_in_source else '',
            Form=wordlist[idx, form] or '' if form else '',
            Segments=wordlist[idx, segments] or '' if segments else '',
            Source=[wordlist[idx, source]] or [] if source else [],
            Comment=wordlist[idx, note] or '' if note else ''))

        if ref:
            cognates.append(dict(
                ID=str(idx),
                Form_ID=str(idx),
                Cognateset_ID=wordlist[idx, ref],
                Alignment=wordlist[idx, alignment] or [''] if alignment else ['']))

    ds.write(
        FormTable=forms,
        LanguageTable=languages.values(),
        ParameterTable=parameters.values(),
        CognateTable=cognates)