def reshape_dataset(dataset: pycldf.Wordlist, add_column: bool = True) -> pycldf.Dataset:
    # check for an existing cognateset table
    if dataset.column_names.cognatesets is None:
        # Create a CognatesetTable
        dataset.add_component("CognatesetTable")
    # add a concept column to the cognateset table
    if add_column:
        if dataset.column_names.cognatesets.parameterReference is None:
            dataset.add_columns("CognatesetTable", "Core_Concept_ID")
            c = dataset["CognatesetTable"].tableSchema.columns[-1]
            c.datatype = dataset["ParameterTable", "ID"].datatype
            c.propertyUrl = URITemplate(
                "http://cldf.clld.org/v1.0/terms.rdf#parameterReference")
    fname = dataset.write_metadata()
    # Reload the dataset with the new column definitions
    dataset = pycldf.Wordlist.from_metadata(fname)
    return dataset
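A minimal usage sketch of the helper above; the metadata filename is an assumption, and the function relies on pycldf and URITemplate (from the uritemplate package) being imported in the surrounding module.

import pycldf
from uritemplate import URITemplate  # used by reshape_dataset above

# Hypothetical input file; any CLDF Wordlist metadata file would do.
dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
dataset = reshape_dataset(dataset, add_column=True)
# After reloading, the dataset exposes the CognatesetTable column names.
print(dataset.column_names.cognatesets)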
def add_concepticon_names(
    dataset: pycldf.Wordlist,
    column_name: str = "Concepticon_Gloss",
):
    # Create a column for the Concepticon gloss (a ValueError means it already exists)
    try:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
    except ValueError:
        pass

    write_back = []
    for row in cli.tq(
        dataset["ParameterTable"],
        task="Write concepts with concepticon names to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[dataset.column_names.parameters.concepticonReference]
            ].gloss
        except KeyError:
            pass
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
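A sketch of how this might be called; it assumes the surrounding module provides the `concepticon` catalog wrapper and the `cli` progress helper used above, and the metadata path is hypothetical.

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # path is an assumption
add_concepticon_names(dataset, column_name="Concepticon_Gloss")
# The glosses are now available on the ParameterTable rows:
for concept in dataset["ParameterTable"]:
    print(concept["ID"], concept.get("Concepticon_Gloss"))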
def cmd_makecldf(self, args):
    dsdir = self.dir / 'raw' / 'Verkerk-DravLex-622ac6e'
    dataset = Wordlist.from_metadata(dsdir / 'Wordlist-metadata.json')

    # load concepts
    concepts = args.writer.add_concepts(
        id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
        lookup_factory="Name"
    )

    # load sources from the original CLDF, and then the fieldwork source
    args.writer.add_sources(*self.raw_dir.read_bib(dsdir / 'sources.bib'))
    args.writer.add_sources()

    # load languages
    args.writer.add_languages()

    # load cognates
    cogs = {
        r['Form_ID']: r
        for r in self.raw_dir.read_csv(dsdir / 'cognates.csv', dicts=True)
    }

    # load data
    for row in self.raw_dir.read_csv(dsdir / 'forms.csv', dicts=True):
        src = row['Source'].split(";") if row['Source'] else ['KolipakamFW']
        cog = cogs.get(row['ID'])
        for lex in args.writer.add_forms_from_value(
                Local_ID=row['ID'],
                Language_ID=row['Language_ID'],
                Parameter_ID=concepts[row['Parameter_ID']],
                Value=row['Form'],
                Source=src,
                Comment=row['status'],
                Loan=True if row['status'] else False):
            args.writer.add_cognate(
                lexeme=lex,
                ID=cog['ID'],
                Source=cog['Source'],
                Cognateset_ID=cog['Cognateset_ID'],
                Comment=", ".join([cog['Comment'], cog['source_comment']])
            )
def make_cldf(db, out, mid):
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = Wordlist.in_dir(out)
    # Source words are not coded for meaning slots, so we have to relax the schema:
    ds['FormTable', 'Parameter_ID'].required = False
    # We add the WOLD language metadata:
    ds.add_component('LanguageTable')
    # some metadata about the comparison meanings:
    ds.add_component('ParameterTable', 'Category', 'SemanticField_ID', 'SemanticField')
    # and the information on borrowings (aka loanwords):
    ds.add_component(
        'BorrowingTable',
        {'name': 'Relation', 'datatype': {'base': 'string', 'format': 'immediate|earlier'}},
        {'name': 'Certain', 'datatype': 'boolean'})

    # Now we collect the data by querying the database:
    forms, languages = [], {}

    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    pids = set()   # store all meaning IDs occurring for any form
    upk2uid = {}   # store the mapping of word pks to Form_ID, for relating loans
    for row in db.execute(SQL_FORMS.format(mid)):
        lpk, lid, lname, vspk, vid, pid, uname, upk = row
        upk2uid[upk] = vid
        ids = lids.get(lpk, {})
        pids.add(pid)
        languages[lpk] = dict(
            ID=lid,
            Name=lname,
            Glottocode=ids.get('glottolog'),
            ISO639P3code=ids.get('iso639-3'),
        )
        forms.append(dict(
            ID=vid,
            Language_ID=lid,
            Parameter_ID=pid,
            Form=uname,
        ))

    borrowings = []
    sourceforms = {}
    for i, row in enumerate(db.execute(SQL_SOURCE_FORMS.format(mid))):
        lpk, lid, lname, form, uid, tpk, lrel, lcertain = row
        ids = lids.get(lpk, {})
        if form != 'Unidentifiable':
            borrowings.append(dict(
                ID='{0}'.format(i + 1),
                Source_Form_ID=uid,
                Target_Form_ID=upk2uid[tpk],
                Relation=lrel,
                Certain=lcertain,
            ))
            sourceforms[uid] = dict(
                ID=uid,
                Language_ID=lid,
                Parameter_ID=None,
                Form=form,
            )
            languages[lpk] = dict(
                ID=lid,
                Name=lname,
                Glottocode=ids.get('glottolog'),
                ISO639P3code=ids.get('iso639-3'),
            )

    meanings = []
    for row in db.execute(SQL_MEANING):
        id, name, semantic_category, sfid, sfname = row
        if id in pids:
            meanings.append(dict(
                ID=id,
                Name=name,
                Category=semantic_category,
                SemanticField_ID=sfid,
                SemanticField=sfname,
            ))

    ds.write(
        FormTable=forms + list(sourceforms.values()),
        ParameterTable=meanings,
        LanguageTable=languages.values(),
        BorrowingTable=borrowings,
    )
    ds.validate()
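The function only needs a DB-API style connection with an execute() method plus the SQL_* query constants defined elsewhere in the original module; a hedged driver sketch using sqlite3, in which the database filename and the value of mid are assumptions.

import sqlite3

db = sqlite3.connect("wold.sqlite")  # hypothetical database file
make_cldf(db, out="cldf", mid=1)     # mid is interpolated into the SQL_FORMS / SQL_SOURCE_FORMS templates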
from xml.etree.ElementTree import fromstring
from xmljson import badgerfish as bf
import sys
import os
import csv
from cariban import util
from pycldf import Wordlist
import re
import pyperclip

lexicon = {}
cariban_data = Wordlist.from_metadata("../cariban_data.json")
for row in cariban_data["FormTable"]:
    alt_glossings = row["Glossing"].split("; ")
    if len(alt_glossings) == 0 or alt_glossings[0] == "":
        meanings = row["Parameter_ID"]
    else:
        meanings = alt_glossings
    lexicon[row["ID"]] = {
        "forms": row["Form"],
        "meanings": meanings,
        "language": row["Language_ID"],
    }
# print(lexicon)


def search_lexicon(form, meaning, language):
    if len(lexicon) == 0:
        return "X"
    if not meaning.isupper():
        new_meaning = meaning.replace(".", " ")
    else:
        new_meaning = meaning  # all-caps (grammatical) glosses are kept as-is
def add_concepticon_references(
    dataset: pycldf.Wordlist,
    gloss_languages: t.Mapping[str, str],
    status_update: t.Optional[str],
    overwrite: bool = False,
) -> None:
    """Guess Concepticon links for a multilingual Concept table.

    Fill the concepticonReference column of the dataset's ParameterTable with
    best guesses for Concepticon IDs, based on gloss columns in different
    languages.

    Parameters
    ==========
    dataset: A pycldf.Wordlist with a concepticonReference column in its
        ParameterTable
    gloss_languages: A mapping from ParameterTable column names to ISO-639-1
        language codes that Concepticon has concept lists for (eg. en, fr, de,
        es, zh, pt)
    status_update: String written to Status_Column of #parameterTable if provided
    overwrite: Overwrite existing Concepticon references

    """
    # TODO: If this function took only dataset["ParameterTable"] and the name
    # of the target column in there as arguments, one could construct examples
    # that just use the Iterable API and therefore look nice as doctests.
    gloss_lists: t.Dict[str, t.List[str]] = {column: [] for column in gloss_languages}

    for row in dataset["ParameterTable"]:
        for column, glosses in gloss_lists.items():
            glosses.append(row[column] or "?")  # Concepticon abhors empty glosses.

    targets = {
        language: concepticon.api._get_map_for_language(language, None)
        for language in gloss_languages.values()
    }

    cmaps: t.List[t.Dict[int, t.Tuple[t.List[int], int]]] = [
        (
            concept_map2(
                glosses,
                [i[1] for i in targets[gloss_languages[column]]],
                similarity_level=2,
                language=gloss_languages[column],
            ),
            # What a horrendous API! Why can't it return glosses or IDs instead
            # of, as it does now, target-indices so I have to schlepp target along
            # with the results?
            targets[gloss_languages[column]],
        )
        for column, glosses in gloss_lists.items()
    ]

    write_back = []
    for i, row in enumerate(dataset["ParameterTable"]):
        if overwrite or not row.get(
            dataset.column_names.parameters.concepticonReference
        ):
            matches = [(m.get(i, ([], 10)), t) for m, t in cmaps]
            best_sim = min(x[0][1] for x in matches)
            best_matches = [t[m] for (ms, s), t in matches for m in ms if s <= best_sim]
            c: t.Counter[str] = collections.Counter(id for id, string in best_matches)
            if len(c) > 1:
                print(row, best_sim, c.most_common())
                row[
                    dataset.column_names.parameters.concepticonReference
                ] = c.most_common(1)[0][0]
            elif len(c) < 1:
                print(row)
            else:
                row[
                    dataset.column_names.parameters.concepticonReference
                ] = c.most_common(1)[0][0]
            # add status update if given
            if status_update:
                row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
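A hedged example call: gloss_languages maps ParameterTable column names to the ISO-639-1 codes of the languages those glosses are written in. The column names and the metadata path below are assumptions.

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # path is an assumption
add_concepticon_references(
    dataset,
    gloss_languages={"Name": "en", "Spanish_Gloss": "es"},  # hypothetical gloss columns
    status_update="automatic Concepticon link",
    overwrite=False,
)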
from clldutils.path import Path
from clldutils.misc import slug
from pycldf import Wordlist
from clld_phylogeny_plugin.models import Phylogeny, TreeLabel, LanguageTreeLabel
from clld_cognacy_plugin.models import Cognate, Cognateset
from csvw.dsv import reader

import cobl2
from cobl2 import models
import clld_cognacy_plugin.models

data_file_path = Path(cobl2.__file__).parent / '../..' / 'iecor'

ds = Wordlist.from_metadata(data_file_path / 'cldf' / 'cldf-metadata.json')

photos = {
    p.stem: p.as_posix()
    for p in (
        Path(cobl2.__file__).parent / '../..' / 'CoBL-public' / 'cobl' / 'static' / 'contributors'
    ).iterdir()
    if p.suffix == '.jpg'}

for k, v in {
    'Kümmel': 'Kuemmel',
    'de Vaan': 'deVaan',
    'Dewey-Findell': 'Dewey',
}.items():
    photos[k] = photos[v]


def main(args):
    data = Data()
def original_cldf(self):
    for p in self.raw_dir.iterdir():
        if p.name.endswith(MD_SUFFIX):
            return Wordlist.from_metadata(p)
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in ["segments", "segmentSlice", "cognatesetReference", "alignment"]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass

    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(forms.items(), task="Extracting cognate judgements from forms…"):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id(
                    "{:}-{:}".format(form["concept"], form["cognatesetReference"]))
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if ("+" in form["segments"]
                            and dataset["FormTable", "cognatesetReference"].separator):
                        logger.warning(
                            "You seem to have morpheme annotations in your cognates. "
                            "I will probably mess them up a bit, because I have not been "
                            "taught properly how to deal with them. Sorry!")
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    if forms_without_segments >= 5:
                        pass
                    else:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using "
            "`lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]

    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)
    dataset.write(CognateTable=cognate_judgements)
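A usage sketch, assuming the lexedata helpers imported by the surrounding module (cache_table, util, cli) are available and the metadata path exists.

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # path is an assumption
# With split=True, cognate set IDs are made unique per concept.
add_cognate_table(dataset, split=True)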
def to_cldf(wordlist, path='cldf', source_path=None, ref="cogid",
            segments="tokens", form="ipa", note='note',
            form_in_source="value", source=None, alignment=None):
    """Convert a wordlist in LingPy to CLDF.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTeX file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default="note")
        The column in which you store your comments.
    form_in_source : str (default="value")
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(
                ID=lid,
                Name=wordlist[idx, 'doculect'],
                Glottocode=wordlist[idx, 'glottocode'])
        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(
                ID=pid,
                Name=wordlist[idx, 'concept'],
                Concepticon_ID=wordlist[idx, 'concepticon_id'])
        forms.append(dict(
            ID=str(idx),
            Language_ID=lid,
            Parameter_ID=pid,
            form_in_source=wordlist[idx, form_in_source] or '' if form_in_source else '',
            Form=wordlist[idx, form] or '' if form else '',
            Segments=wordlist[idx, segments] or '' if segments else '',
            Source=[wordlist[idx, source]] or [] if source else [],
            Comment=wordlist[idx, note] or '' if note else ''))
        if ref:
            cognates.append(dict(
                ID=str(idx),
                Form_ID=str(idx),
                Cognateset_ID=wordlist[idx, ref],
                Alignment=wordlist[idx, alignment] or [''] if alignment else ['']))

    ds.write(
        FormTable=forms,
        LanguageTable=languages.values(),
        ParameterTable=parameters.values(),
        CognateTable=cognates)
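A minimal conversion sketch with LingPy; the TSV filename and its column inventory (doculect, concept, ipa, tokens, cogid) are assumptions about the input wordlist.

from lingpy import Wordlist

wl = Wordlist("wordlist.tsv")  # hypothetical LingPy wordlist file
to_cldf(wl, path="cldf", ref="cogid", segments="tokens", form="ipa")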
def main(args):  # pragma: no cover
    wl = Wordlist.from_metadata(args.data_file('cldf', 'cldf-metadata.json'))
    data = Data()
    data.add(
        common.Contributor, 'barthwolfgang',
        id='barthwolfgang',
        name="Wolfgang Barth",
        url="http://www.dynamicsoflanguage.edu.au/")
    #
    # FIXME: get dataset attributes from CLDF metadata!
    #
    dataset = common.Dataset(
        id='parabank',
        name='Parabank Pronouns',
        description='Database of pronouns',
        domain='parabank.clld.org',
        publisher_name="CoEDL Centre of Excellence for the Dynamics of Language",
        publisher_place="Canberra, Australia",
        publisher_url="http://www.dynamicsoflanguage.edu.au/",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0'})
    DBSession.add(dataset)

    for i, editor in enumerate(['barthwolfgang']):
        common.Editor(
            dataset=dataset,
            contributor=data['Contributor'][editor],
            ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for l in wl['LanguageTable']:
        lang = data.add(
            models.ParabankLanguage, l['ID'],
            id=l['ID'],
            name=l['Name'],
            description=l['Notes'],
            source=l['Source_Citation'],
            classification=l['Classification'],
        )
        add_language_codes(data, lang, None, glottocode=l['Glottocode'])

    for p in wl['ParameterTable']:
        data.add(
            common.Parameter, p['ID'],
            id=p['ID'],
            name='{0} ({1})'.format(p['Name'], p['ID']),
            # description=p['Description'],
        )

    for f in wl['FormTable']:
        vsid = '{0}-{1}'.format(f['Parameter_ID'], f['Language_ID'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                language=data['ParabankLanguage'][f['Language_ID']],
                parameter=data['Parameter'][f['Parameter_ID']],
                contribution=contrib)
        DBSession.add(models.Word(
            id=f['ID'],
            name=f['Form'],
            comment=f.get('Comment'),
            original=f['Original_parameter'],
            valueset=vs))

    load_families(
        data,
        [(l.glottocode, l) for l in data['ParabankLanguage'].values()],
        glottolog_repos=args.data_file('glottolog'),
        isolates_icon='tcccccc')