from collections import defaultdict
from itertools import groupby

from pycldf import Wordlist

# SQL_IDENTIFIERS, SQL_FORMS, SQL_SOURCE_FORMS and SQL_MEANING are SQL query
# strings defined elsewhere in this module.


def make_cldf(db, out, mid):
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = Wordlist.in_dir(out)
    # Source words are not coded for meaning slots, so we have to relax the schema:
    ds['FormTable', 'Parameter_ID'].required = False
    # We add the WOLD language metadata:
    ds.add_component('LanguageTable')
    # ... some metadata about the comparison meanings:
    ds.add_component('ParameterTable', 'Category', 'SemanticField_ID', 'SemanticField')
    # ... and the information on borrowings (aka loanwords):
    ds.add_component(
        'BorrowingTable',
        {
            'name': 'Relation',
            'datatype': {'base': 'string', 'format': 'immediate|earlier'}},
        {'name': 'Certain', 'datatype': 'boolean'})

    # Now we collect the data by querying the database:
    forms, languages = [], {}
    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # Only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    pids = set()  # store all meaning IDs occurring for any form
    upk2uid = {}  # store the mapping of word pks to Form_ID, for relating loans
    for row in db.execute(SQL_FORMS.format(mid)):
        lpk, lid, lname, vspk, vid, pid, uname, upk = row
        upk2uid[upk] = vid
        ids = lids.get(lpk, {})
        pids.add(pid)
        languages[lpk] = dict(
            ID=lid,
            Name=lname,
            Glottocode=ids.get('glottolog'),
            ISO639P3code=ids.get('iso639-3'),
        )
        forms.append(dict(
            ID=vid,
            Language_ID=lid,
            Parameter_ID=pid,
            Form=uname,
        ))

    borrowings = []
    sourceforms = {}
    for i, row in enumerate(db.execute(SQL_SOURCE_FORMS.format(mid))):
        lpk, lid, lname, form, uid, tpk, lrel, lcertain = row
        ids = lids.get(lpk, {})
        if form != 'Unidentifiable':
            borrowings.append(dict(
                ID='{0}'.format(i + 1),
                Source_Form_ID=uid,
                Target_Form_ID=upk2uid[tpk],
                Relation=lrel,
                Certain=lcertain,
            ))
            sourceforms[uid] = dict(
                ID=uid,
                Language_ID=lid,
                Parameter_ID=None,
                Form=form,
            )
            languages[lpk] = dict(
                ID=lid,
                Name=lname,
                Glottocode=ids.get('glottolog'),
                ISO639P3code=ids.get('iso639-3'),
            )

    meanings = []
    for row in db.execute(SQL_MEANING):
        id, name, semantic_category, sfid, sfname = row
        if id in pids:
            meanings.append(dict(
                ID=id,
                Name=name,
                Category=semantic_category,
                SemanticField_ID=sfid,
                SemanticField=sfname,
            ))

    ds.write(
        FormTable=forms + list(sourceforms.values()),
        ParameterTable=meanings,
        LanguageTable=languages.values(),
        BorrowingTable=borrowings,
    )
    ds.validate()
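# Minimal usage sketch for make_cldf (hypothetical, not part of the module):
# it assumes `db` is a sqlite3 connection to a WOLD database dump (any object
# whose `execute()` returns rows works), `out` an output directory, and `mid`
# the pk of the vocabulary to convert. File names and the pk are placeholders.
if __name__ == '__main__':
    import sqlite3
    from pathlib import Path

    db = sqlite3.connect('wold.sqlite')  # assumed path to the database dump
    out = Path('cldf')
    out.mkdir(exist_ok=True)
    make_cldf(db, out, 1)  # convert the vocabulary with pk 1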
from clldutils.misc import slug
from clldutils.path import read_text  # clldutils < 3; newer code can use pathlib

try:
    # pycldf is an optional dependency; record whether it is available.
    from pycldf import Wordlist as CLDF_Wordlist
    cldf = True
except ImportError:
    cldf = False


def to_cldf(wordlist, path='cldf', source_path=None, ref="cogid",
            segments="tokens", form="ipa", note='note',
            form_in_source="value", source=None, alignment=None):
    """Convert a wordlist in LingPy to CLDF.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTeX file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default="note")
        The column in which you store your comments.
    form_in_source : str (default="value")
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(
                ID=lid,
                Name=wordlist[idx, 'doculect'],
                Glottocode=wordlist[idx, 'glottocode'])
        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(
                ID=pid,
                Name=wordlist[idx, 'concept'],
                Concepticon_ID=wordlist[idx, 'concepticon_id'])
        forms.append(dict(
            ID=str(idx),
            Language_ID=lid,
            Parameter_ID=pid,
            form_in_source=wordlist[idx, form_in_source] or '' if form_in_source else '',
            Form=wordlist[idx, form] or '' if form else '',
            Segments=wordlist[idx, segments] or '' if segments else '',
            Source=[wordlist[idx, source]] or [] if source else [],
            Comment=wordlist[idx, note] or '' if note else ''))
        if ref:
            cognates.append(dict(
                ID=str(idx),
                Form_ID=str(idx),
                Cognateset_ID=wordlist[idx, ref],
                Alignment=wordlist[idx, alignment] or [''] if alignment else ['']))

    ds.write(
        FormTable=forms,
        LanguageTable=languages.values(),
        ParameterTable=parameters.values(),
        CognateTable=cognates)
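# Minimal usage sketch for to_cldf (hypothetical file name): assumes a LingPy
# wordlist file with `doculect`, `concept`, `concepticon_id`, `glottocode`,
# `ipa`, `tokens`, `cogid`, `note` and `value` columns.
if __name__ == '__main__':
    from lingpy import Wordlist

    wl = Wordlist('wordlist.tsv')  # placeholder input file
    to_cldf(wl, path='cldf')
    # The written dataset can then be re-loaded and checked with pycldf:
    #     ds = CLDF_Wordlist.from_metadata('cldf/Wordlist-metadata.json')
    #     ds.validate()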