Example no. 1
0
def make_cldf(db, out, mid):
    """Extract the word data for meaning list ``mid`` from ``db`` into a CLDF Wordlist.

    :param db: Database object exposing an ``execute`` method for raw SQL queries.
    :param out: Directory in which the CLDF dataset is created.
    :param mid: Meaning-list identifier, interpolated into the form queries.
    """
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = Wordlist.in_dir(out)

    # Source words are not coded for meaning slots, so we have to relax the schema:
    ds['FormTable', 'Parameter_ID'].required = False

    # We add the WOLD language metadata:
    ds.add_component('LanguageTable')

    # some metadata about the comparison meanings:
    ds.add_component('ParameterTable', 'Category', 'SemanticField_ID', 'SemanticField')

    # and the information on borrowings (aka loanwords):
    ds.add_component(
        'BorrowingTable',
        {
            'name': 'Relation',
            'datatype': {'base': 'string', 'format': 'immediate|earlier'}},
        {'name': 'Certain', 'datatype': 'boolean'})

    # Now we collect the data by querying the database:
    forms, languages = [], {}

    # Map language pk -> {identifier type: identifier name}.
    # NOTE: itertools.groupby only merges *consecutive* rows, so SQL_IDENTIFIERS
    # is assumed to return rows ordered by language pk and identifier type.
    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    pids = set()  # store all meaning IDs occurring for any form
    upk2uid = {}  # store the mapping of word pks to Form_ID, for relating loans
    for row in db.execute(SQL_FORMS.format(mid)):
        lpk, lid, lname, vspk, vid, pid, uname, upk = row
        upk2uid[upk] = vid
        ids = lids.get(lpk, {})
        pids.add(pid)
        languages[lpk] = dict(
            ID=lid,
            Name=lname,
            Glottocode=ids.get('glottolog'),
            ISO639P3code=ids.get('iso639-3'),
        )
        forms.append(dict(
            ID=vid,
            Language_ID=lid,
            Parameter_ID=pid,
            Form=uname,
        ))

    borrowings = []
    sourceforms = {}
    # enumerate from 1 so borrowing IDs are 1-based, without manual "i + 1".
    for i, row in enumerate(db.execute(SQL_SOURCE_FORMS.format(mid)), start=1):
        lpk, lid, lname, form, uid, tpk, lrel, lcertain = row
        ids = lids.get(lpk, {})
        if form != 'Unidentifiable':
            borrowings.append(dict(
                ID='{0}'.format(i),
                Source_Form_ID=uid,
                Target_Form_ID=upk2uid[tpk],
                Relation=lrel,
                Certain=lcertain,
            ))
            sourceforms[uid] = dict(
                ID=uid,
                Language_ID=lid,
                Parameter_ID=None,  # source words carry no meaning slot
                Form=form,
            )
            languages[lpk] = dict(
                ID=lid,
                Name=lname,
                Glottocode=ids.get('glottolog'),
                ISO639P3code=ids.get('iso639-3'),
            )

    meanings = []
    for row in db.execute(SQL_MEANING):
        # "meaning_id" avoids shadowing the builtin `id`.
        meaning_id, name, semantic_category, sfid, sfname = row
        if meaning_id in pids:  # only keep meanings actually linked to a form
            meanings.append(dict(
                ID=meaning_id,
                Name=name,
                Category=semantic_category,
                SemanticField_ID=sfid,
                SemanticField=sfname,
            ))

    ds.write(
        FormTable=forms + list(sourceforms.values()),
        ParameterTable=meanings,
        LanguageTable=languages.values(),
        BorrowingTable=borrowings,
    )
    ds.validate()
Example no. 2
0
def make_cldf(db, out, mid):
    """Build a CLDF Wordlist dataset in ``out`` from the words of meaning list ``mid``."""
    # Set up the dataset skeleton: a Wordlist module in the target directory.
    dataset = Wordlist.in_dir(out)

    # Source words carry no meaning slot, so Parameter_ID must be optional:
    dataset['FormTable', 'Parameter_ID'].required = False

    # Language metadata:
    dataset.add_component('LanguageTable')

    # Comparison-meaning metadata:
    dataset.add_component('ParameterTable', 'Category', 'SemanticField_ID',
                          'SemanticField')

    # Borrowing (loanword) information:
    relation_col = {
        'name': 'Relation',
        'datatype': {'base': 'string', 'format': 'immediate|earlier'},
    }
    certain_col = {'name': 'Certain', 'datatype': 'boolean'}
    dataset.add_component('BorrowingTable', relation_col, certain_col)

    # Per-language identifiers, keyed by language pk and identifier type.
    identifiers = defaultdict(dict)
    for lang_pk, id_rows in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for id_type, grouped in groupby(id_rows, lambda rr: rr[1]):
            candidates = [item[2] for item in grouped]
            # Keep an identifier only when it is unambiguous (exactly one match).
            if len(candidates) == 1:
                identifiers[lang_pk][id_type] = candidates[0]

    def language_row(lang_pk, lang_id, lang_name):
        # Assemble a LanguageTable row, attaching any known identifiers.
        known = identifiers.get(lang_pk, {})
        return {
            'ID': lang_id,
            'Name': lang_name,
            'Glottocode': known.get('glottolog'),
            'ISO639P3code': known.get('iso639-3'),
        }

    forms = []
    languages = {}
    meaning_ids = set()      # every meaning ID attached to some form
    word_pk_to_form_id = {}  # word pk -> Form_ID, needed to link loans

    for lpk, lid, lname, vspk, vid, pid, uname, upk in db.execute(
            SQL_FORMS.format(mid)):
        word_pk_to_form_id[upk] = vid
        meaning_ids.add(pid)
        languages[lpk] = language_row(lpk, lid, lname)
        forms.append({
            'ID': vid,
            'Language_ID': lid,
            'Parameter_ID': pid,
            'Form': uname,
        })

    borrowings = []
    sourceforms = {}
    for i, row in enumerate(db.execute(SQL_SOURCE_FORMS.format(mid))):
        lpk, lid, lname, form, uid, tpk, lrel, lcertain = row
        if form == 'Unidentifiable':
            continue  # skip placeholder source words
        borrowings.append({
            'ID': '{0}'.format(i + 1),
            'Source_Form_ID': uid,
            'Target_Form_ID': word_pk_to_form_id[tpk],
            'Relation': lrel,
            'Certain': lcertain,
        })
        sourceforms[uid] = {
            'ID': uid,
            'Language_ID': lid,
            'Parameter_ID': None,
            'Form': form,
        }
        languages[lpk] = language_row(lpk, lid, lname)

    # Keep only the meanings that are referenced by at least one form.
    meanings = []
    for mrow in db.execute(SQL_MEANING):
        if mrow[0] in meaning_ids:
            meanings.append({
                'ID': mrow[0],
                'Name': mrow[1],
                'Category': mrow[2],
                'SemanticField_ID': mrow[3],
                'SemanticField': mrow[4],
            })

    dataset.write(
        FormTable=forms + list(sourceforms.values()),
        ParameterTable=meanings,
        LanguageTable=languages.values(),
        BorrowingTable=borrowings,
    )
    dataset.validate()
Example no. 3
0
def to_cldf(wordlist,
            path='cldf',
            source_path=None,
            ref="cogid",
            segments="tokens",
            form="ipa",
            note='note',
            form_in_source="value",
            source=None,
            alignment=None):
    """Convert a wordlist in LingPy to CLDF.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTex file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default='note')
        The column in which you store your comments.
    form_in_source : str (default="value")
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.

    Raises
    ------
    ValueError
        If the ``pycldf`` package is not installed.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            # BUG FIX: the LanguageTable column is 'Glottocode' (capitalized);
            # the lowercase 'glottocode' key did not match the CLDF schema.
            languages[lid] = dict(ID=lid,
                                  Name=wordlist[idx, 'doculect'],
                                  Glottocode=wordlist[idx, 'glottocode'])

        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(ID=pid,
                                   Name=wordlist[idx, 'concept'],
                                   Concepticon_ID=wordlist[idx,
                                                           'concepticon_id'])

        forms.append(
            dict(ID=str(idx),
                 Language_ID=lid,
                 Parameter_ID=pid,
                 form_in_source=wordlist[idx, form_in_source] or ''
                 if form_in_source else '',
                 Form=wordlist[idx, form] or '' if form else '',
                 Segments=wordlist[idx, segments] or '' if segments else '',
                 # '[value] or []' was dead code (a one-element list is always
                 # truthy); the guard on `source` alone decides the value.
                 Source=[wordlist[idx, source]] if source else [],
                 Comment=wordlist[idx, note] or '' if note else ''))

        if ref:
            cognates.append(
                dict(ID=str(idx),
                     Form_ID=str(idx),
                     Cognateset_ID=wordlist[idx, ref],
                     Alignment=wordlist[idx, alignment] or ['']
                     if alignment else ['']))

    ds.write(FormTable=forms,
             LanguageTable=languages.values(),
             ParameterTable=parameters.values(),
             CognateTable=cognates)
Example no. 4
0
def to_cldf(wordlist, path='cldf', source_path=None, ref="cogid",
        segments="tokens", form="ipa", note='note', form_in_source="value",
        source=None, alignment=None):
    """Convert a wordlist in LingPy to CLDF.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTex file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default='note')
        The column in which you store your comments.
    form_in_source : str (default="value")
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.

    Raises
    ------
    ValueError
        If the ``pycldf`` package is not installed.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(
            read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(
                    ID=lid,
                    Name=wordlist[idx, 'doculect'],
                    Glottocode=wordlist[idx, 'glottocode'])

        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(
                ID=pid,
                Name=wordlist[idx, 'concept'],
                Concepticon_ID=wordlist[idx, 'concepticon_id'])

        forms.append(dict(
            ID=str(idx),
            Language_ID=lid,
            Parameter_ID=pid,
            form_in_source=wordlist[idx, form_in_source] or '' if form_in_source else '',
            Form=wordlist[idx, form] or '' if form else '',
            Segments=wordlist[idx, segments] or '' if segments else '',
            # '[value] or []' was dead code (a one-element list is always
            # truthy); the guard on `source` alone decides the value.
            Source=[wordlist[idx, source]] if source else [],
            Comment=wordlist[idx, note] or '' if note else ''))

        if ref:
            cognates.append(dict(
                ID=str(idx),
                Form_ID=str(idx),
                Cognateset_ID=wordlist[idx, ref],
                Alignment=wordlist[idx, alignment] or [''] if alignment else ['']))

    ds.write(
        FormTable=forms,
        LanguageTable=languages.values(),
        ParameterTable=parameters.values(),
        CognateTable=cognates)