def reshape_dataset(dataset: pycldf.Wordlist, add_column: bool = True) -> pycldf.Dataset:
    # check for an existing cognateset table
    if dataset.column_names.cognatesets is None:
        # Create a CognatesetTable
        dataset.add_component("CognatesetTable")
    # add a concept column to the cognateset table
    if add_column:
        if dataset.column_names.cognatesets.parameterReference is None:
            dataset.add_columns("CognatesetTable", "Core_Concept_ID")
            c = dataset["CognatesetTable"].tableSchema.columns[-1]
            c.datatype = dataset["ParameterTable", "ID"].datatype
            c.propertyUrl = URITemplate(
                "http://cldf.clld.org/v1.0/terms.rdf#parameterReference")
    fname = dataset.write_metadata()
    # Reload the dataset with the new column definitions
    dataset = pycldf.Wordlist.from_metadata(fname)
    return dataset
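A minimal usage sketch of the helper above; the metadata filename is an assumption, and the function relies on pycldf and URITemplate (from the uritemplate package) being imported in the surrounding module.

import pycldf
from uritemplate import URITemplate  # used by reshape_dataset above

# Hypothetical input file; any CLDF Wordlist metadata file would do.
dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")
dataset = reshape_dataset(dataset, add_column=True)
# After reloading, the dataset exposes the CognatesetTable column names.
print(dataset.column_names.cognatesets)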
def add_concepticon_names(
    dataset: pycldf.Wordlist,
    column_name: str = "Concepticon_Gloss",
):
    # Create a column for the Concepticon gloss (a ValueError means it already exists)
    try:
        dataset.add_columns("ParameterTable", column_name)
        dataset.write_metadata()
    except ValueError:
        pass

    write_back = []
    for row in cli.tq(
        dataset["ParameterTable"],
        task="Write concepts with concepticon names to dataset",
    ):
        try:
            row[column_name] = concepticon.api.conceptsets[
                row[dataset.column_names.parameters.concepticonReference]
            ].gloss
        except KeyError:
            pass
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
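A sketch of how this might be called; it assumes the surrounding module provides the `concepticon` catalog wrapper and the `cli` progress helper used above, and the metadata path is hypothetical.

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # path is an assumption
add_concepticon_names(dataset, column_name="Concepticon_Gloss")
# The glosses are now available on the ParameterTable rows:
for concept in dataset["ParameterTable"]:
    print(concept["ID"], concept.get("Concepticon_Gloss"))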
def cmd_makecldf(self, args):
    dsdir = self.dir / 'raw' / 'Verkerk-DravLex-622ac6e'
    dataset = Wordlist.from_metadata(dsdir / 'Wordlist-metadata.json')

    # load concepts
    concepts = args.writer.add_concepts(
        id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
        lookup_factory="Name"
    )

    # load sources from the original CLDF, and then the fieldwork source
    args.writer.add_sources(*self.raw_dir.read_bib(dsdir / 'sources.bib'))
    args.writer.add_sources()

    # load languages
    args.writer.add_languages()

    # load cognates
    cogs = {
        r['Form_ID']: r
        for r in self.raw_dir.read_csv(dsdir / 'cognates.csv', dicts=True)
    }

    # load data
    for row in self.raw_dir.read_csv(dsdir / 'forms.csv', dicts=True):
        src = row['Source'].split(";") if row['Source'] else ['KolipakamFW']
        cog = cogs.get(row['ID'])
        for lex in args.writer.add_forms_from_value(
                Local_ID=row['ID'],
                Language_ID=row['Language_ID'],
                Parameter_ID=concepts[row['Parameter_ID']],
                Value=row['Form'],
                Source=src,
                Comment=row['status'],
                Loan=True if row['status'] else False):
            args.writer.add_cognate(
                lexeme=lex,
                ID=cog['ID'],
                Source=cog['Source'],
                Cognateset_ID=cog['Cognateset_ID'],
                Comment=", ".join([cog['Comment'], cog['source_comment']])
            )
def make_cldf(db, out, mid):
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = Wordlist.in_dir(out)
    # Source words are not coded for meaning slots, so we have to relax the schema:
    ds['FormTable', 'Parameter_ID'].required = False
    # We add the WOLD language metadata:
    ds.add_component('LanguageTable')
    # some metadata about the comparison meanings:
    ds.add_component('ParameterTable', 'Category', 'SemanticField_ID', 'SemanticField')
    # and the information on borrowings (aka loanwords):
    ds.add_component(
        'BorrowingTable',
        {'name': 'Relation', 'datatype': {'base': 'string', 'format': 'immediate|earlier'}},
        {'name': 'Certain', 'datatype': 'boolean'})

    # Now we collect the data by querying the database:
    forms, languages = [], {}

    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    pids = set()   # store all meaning IDs occurring for any form
    upk2uid = {}   # store the mapping of word pks to Form_ID, for relating loans
    for row in db.execute(SQL_FORMS.format(mid)):
        lpk, lid, lname, vspk, vid, pid, uname, upk = row
        upk2uid[upk] = vid
        ids = lids.get(lpk, {})
        pids.add(pid)
        languages[lpk] = dict(
            ID=lid,
            Name=lname,
            Glottocode=ids.get('glottolog'),
            ISO639P3code=ids.get('iso639-3'),
        )
        forms.append(dict(
            ID=vid,
            Language_ID=lid,
            Parameter_ID=pid,
            Form=uname,
        ))

    borrowings = []
    sourceforms = {}
    for i, row in enumerate(db.execute(SQL_SOURCE_FORMS.format(mid))):
        lpk, lid, lname, form, uid, tpk, lrel, lcertain = row
        ids = lids.get(lpk, {})
        if form != 'Unidentifiable':
            borrowings.append(dict(
                ID='{0}'.format(i + 1),
                Source_Form_ID=uid,
                Target_Form_ID=upk2uid[tpk],
                Relation=lrel,
                Certain=lcertain,
            ))
            sourceforms[uid] = dict(
                ID=uid,
                Language_ID=lid,
                Parameter_ID=None,
                Form=form,
            )
            languages[lpk] = dict(
                ID=lid,
                Name=lname,
                Glottocode=ids.get('glottolog'),
                ISO639P3code=ids.get('iso639-3'),
            )

    meanings = []
    for row in db.execute(SQL_MEANING):
        id, name, semantic_category, sfid, sfname = row
        if id in pids:
            meanings.append(dict(
                ID=id,
                Name=name,
                Category=semantic_category,
                SemanticField_ID=sfid,
                SemanticField=sfname,
            ))

    ds.write(
        FormTable=forms + list(sourceforms.values()),
        ParameterTable=meanings,
        LanguageTable=languages.values(),
        BorrowingTable=borrowings,
    )
    ds.validate()
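The function only needs a DB-API style connection with an execute() method plus the SQL_* query constants defined elsewhere in the original module; a hedged driver sketch using sqlite3, in which the database filename and the value of mid are assumptions.

import sqlite3

db = sqlite3.connect("wold.sqlite")  # hypothetical database file
make_cldf(db, out="cldf", mid=1)     # mid is interpolated into the SQL_FORMS / SQL_SOURCE_FORMS templates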
from xml.etree.ElementTree import fromstring
from xmljson import badgerfish as bf
import sys
import os
import csv
from cariban import util
from pycldf import Wordlist
import re
import pyperclip

lexicon = {}
cariban_data = Wordlist.from_metadata("../cariban_data.json")
for row in cariban_data["FormTable"]:
    alt_glossings = row["Glossing"].split("; ")
    if len(alt_glossings) == 0 or alt_glossings[0] == "":
        meanings = row["Parameter_ID"]
    else:
        meanings = alt_glossings
    lexicon[row["ID"]] = {
        "forms": row["Form"],
        "meanings": meanings,
        "language": row["Language_ID"],
    }
# print(lexicon)


def search_lexicon(form, meaning, language):
    if len(lexicon) == 0:
        return "X"
    if not meaning.isupper():
        new_meaning = meaning.replace(".", " ")
    else:
        new_meaning = meaning  # all-caps (grammatical) glosses are kept as-is
def add_concepticon_references(
    dataset: pycldf.Wordlist,
    gloss_languages: t.Mapping[str, str],
    status_update: t.Optional[str],
    overwrite: bool = False,
) -> None:
    """Guess Concepticon links for a multilingual Concept table.

    Fill the concepticonReference column of the dataset's ParameterTable with
    best guesses for Concepticon IDs, based on gloss columns in different
    languages.

    Parameters
    ==========
    dataset: A pycldf.Wordlist with a concepticonReference column in its
        ParameterTable
    gloss_languages: A mapping from ParameterTable column names to ISO-639-1
        language codes that Concepticon has concept lists for (eg. en, fr, de,
        es, zh, pt)
    status_update: String written to Status_Column of #parameterTable if provided
    overwrite: Overwrite existing Concepticon references

    """
    # TODO: If this function took only dataset["ParameterTable"] and the name
    # of the target column in there as arguments, one could construct examples
    # that just use the Iterable API and therefore look nice as doctests.
    gloss_lists: t.Dict[str, t.List[str]] = {column: [] for column in gloss_languages}

    for row in dataset["ParameterTable"]:
        for column, glosses in gloss_lists.items():
            glosses.append(row[column] or "?")  # Concepticon abhors empty glosses.

    targets = {
        language: concepticon.api._get_map_for_language(language, None)
        for language in gloss_languages.values()
    }

    cmaps: t.List[t.Dict[int, t.Tuple[t.List[int], int]]] = [
        (
            concept_map2(
                glosses,
                [i[1] for i in targets[gloss_languages[column]]],
                similarity_level=2,
                language=gloss_languages[column],
            ),
            # What a horrendous API! Why can't it return glosses or IDs instead
            # of, as it does now, target-indices so I have to schlepp target along
            # with the results?
            targets[gloss_languages[column]],
        )
        for column, glosses in gloss_lists.items()
    ]

    write_back = []
    for i, row in enumerate(dataset["ParameterTable"]):
        if overwrite or not row.get(
            dataset.column_names.parameters.concepticonReference
        ):
            matches = [(m.get(i, ([], 10)), t) for m, t in cmaps]
            best_sim = min(x[0][1] for x in matches)
            best_matches = [t[m] for (ms, s), t in matches for m in ms if s <= best_sim]
            c: t.Counter[str] = collections.Counter(id for id, string in best_matches)
            if len(c) > 1:
                print(row, best_sim, c.most_common())
                row[
                    dataset.column_names.parameters.concepticonReference
                ] = c.most_common(1)[0][0]
            elif len(c) < 1:
                print(row)
            else:
                row[
                    dataset.column_names.parameters.concepticonReference
                ] = c.most_common(1)[0][0]
            # add status update if given
            if status_update:
                row["Status_Column"] = status_update
        write_back.append(row)
    dataset.write(ParameterTable=write_back)
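A hedged example call: gloss_languages maps ParameterTable column names to the ISO-639-1 codes of the languages those glosses are written in. The column names and the metadata path below are assumptions.

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # path is an assumption
add_concepticon_references(
    dataset,
    gloss_languages={"Name": "en", "Spanish_Gloss": "es"},  # hypothetical gloss columns
    status_update="automatic Concepticon link",
    overwrite=False,
)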
from clldutils.path import Path
from clldutils.misc import slug
from pycldf import Wordlist
from clld_phylogeny_plugin.models import Phylogeny, TreeLabel, LanguageTreeLabel
from clld_cognacy_plugin.models import Cognate, Cognateset
from csvw.dsv import reader

import cobl2
from cobl2 import models
import clld_cognacy_plugin.models

data_file_path = Path(cobl2.__file__).parent / '../..' / 'iecor'

ds = Wordlist.from_metadata(data_file_path / 'cldf' / 'cldf-metadata.json')

photos = {
    p.stem: p.as_posix()
    for p in (
        Path(cobl2.__file__).parent / '../..' / 'CoBL-public' / 'cobl' / 'static' / 'contributors'
    ).iterdir()
    if p.suffix == '.jpg'}

for k, v in {
    'Kümmel': 'Kuemmel',
    'de Vaan': 'deVaan',
    'Dewey-Findell': 'Dewey',
}.items():
    photos[k] = photos[v]


def main(args):
    data = Data()
def original_cldf(self):
    for p in self.raw_dir.iterdir():
        if p.name.endswith(MD_SUFFIX):
            return Wordlist.from_metadata(p)
def add_cognate_table(
    dataset: pycldf.Wordlist,
    split: bool = True,
    logger: cli.logging.Logger = cli.logger,
) -> None:
    if "CognateTable" in dataset:
        return
    dataset.add_component("CognateTable")

    # TODO: Check if that cognatesetReference is already a foreign key to
    # elsewhere (could be a CognatesetTable, could be whatever), because then
    # we need to transfer that knowledge.

    # Load anything that's useful for a cognate set table: Form IDs, segments,
    # segment slices, cognateset references, alignments
    columns = {
        "id": dataset["FormTable", "id"].name,
        "concept": dataset["FormTable", "parameterReference"].name,
        "form": dataset["FormTable", "form"].name,
    }
    for property in ["segments", "segmentSlice", "cognatesetReference", "alignment"]:
        try:
            columns[property] = dataset["FormTable", property].name
        except KeyError:
            pass

    cognate_judgements = []
    forms = cache_table(dataset, columns=columns)
    forms_without_segments = 0
    for f, form in cli.tq(forms.items(), task="Extracting cognate judgements from forms…"):
        if form.get("cognatesetReference"):
            if split:
                cogset = util.string_to_id(
                    "{:}-{:}".format(form["concept"], form["cognatesetReference"]))
            else:
                cogset = form["cognatesetReference"]
            judgement = {
                "ID": f,
                "Form_ID": f,
                "Cognateset_ID": cogset,
            }
            try:
                judgement["Segment_Slice"] = form["segmentSlice"]
            except KeyError:
                try:
                    if not form["segments"]:
                        raise ValueError("No segments")
                    if ("+" in form["segments"]
                            and dataset["FormTable", "cognatesetReference"].separator):
                        logger.warning(
                            "You seem to have morpheme annotations in your cognates. "
                            "I will probably mess them up a bit, because I have not been "
                            "taught properly how to deal with them. Sorry!")
                    judgement["Segment_Slice"] = [
                        "1:{:d}".format(len(form["segments"]))
                    ]
                except (KeyError, TypeError, ValueError):
                    forms_without_segments += 1
                    if forms_without_segments >= 5:
                        pass
                    else:
                        logger.warning(
                            f"No segments found for form {f} ({form['form']})."
                        )
            # What does an alignment mean without segments or their slices?
            # Doesn't matter, if we were given one, we take it.
            judgement["Alignment"] = form.get("alignment")
            cognate_judgements.append(judgement)

    if forms_without_segments >= 5:
        logger.warning(
            "No segments found for %d forms. You can generate segments using "
            "`lexedata.edit.segment_using_clts`.",
            forms_without_segments,
        )

    # Delete the cognateset column
    cols = dataset["FormTable"].tableSchema.columns
    remove = {
        dataset["FormTable", c].name
        for c in ["cognatesetReference", "segmentSlice", "alignment"]
        if ("FormTable", c) in dataset
    }

    def clean_form(form):
        for c in remove:
            form.pop(c, None)
        return form

    forms = [clean_form(form) for form in dataset["FormTable"]]

    for c in remove:
        ix = cols.index(dataset["FormTable", c])
        del cols[ix]

    dataset.write(FormTable=forms)
    dataset.write(CognateTable=cognate_judgements)
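A usage sketch, assuming the lexedata helpers imported by the surrounding module (cache_table, util, cli) are available and the metadata path exists.

import pycldf

dataset = pycldf.Wordlist.from_metadata("Wordlist-metadata.json")  # path is an assumption
# With split=True, cognate set IDs are made unique per concept.
add_cognate_table(dataset, split=True)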
def to_cldf(wordlist, path='cldf', source_path=None, ref="cogid",
            segments="tokens", form="ipa", note='note',
            form_in_source="value", source=None, alignment=None):
    """Convert a wordlist in LingPy to CLDF.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A regular Wordlist object (or similar).
    path : str (default='cldf')
        The name of the directory to which the files will be written.
    source_path : str (default=None)
        If available, specify the path of your BibTeX file with the sources.
    ref : str (default="cogid")
        The column in which the cognate sets are stored.
    segments : str (default="tokens")
        The column in which the segmented phonetic strings are stored.
    form : str (default="ipa")
        The column in which the unsegmented phonetic strings are stored.
    note : str (default="note")
        The column in which you store your comments.
    form_in_source : str (default="value")
        The column in which you store the original form in the source.
    source : str (default=None)
        The column in which you store your source information.
    alignment : str (default=None)
        The column in which you store the alignments.
    """
    if not cldf:
        raise ValueError('The package pycldf needs to be installed')

    # create cldf-dataset
    ds = CLDF_Wordlist.in_dir(path)
    # add sources if they are available
    ds.add_sources(read_text(source_path) if source_path else '')
    # add components
    ds.add_component('LanguageTable')
    ds.add_component('ParameterTable', 'Concepticon_ID')
    ds.add_component('CognateTable')
    ds.add_columns('FormTable', 'form_in_source')

    languages, parameters, forms, cognates = {}, {}, [], []
    for idx in wordlist:
        lid = slug(wordlist[idx, 'doculect'])
        if lid not in languages:
            languages[lid] = dict(
                ID=lid,
                Name=wordlist[idx, 'doculect'],
                Glottocode=wordlist[idx, 'glottocode'])
        pid = wordlist[idx, 'concepticon_id'] or slug(wordlist[idx, 'concept'])
        if pid not in parameters:
            parameters[pid] = dict(
                ID=pid,
                Name=wordlist[idx, 'concept'],
                Concepticon_ID=wordlist[idx, 'concepticon_id'])
        forms.append(dict(
            ID=str(idx),
            Language_ID=lid,
            Parameter_ID=pid,
            form_in_source=wordlist[idx, form_in_source] or '' if form_in_source else '',
            Form=wordlist[idx, form] or '' if form else '',
            Segments=wordlist[idx, segments] or '' if segments else '',
            Source=[wordlist[idx, source]] or [] if source else [],
            Comment=wordlist[idx, note] or '' if note else ''))
        if ref:
            cognates.append(dict(
                ID=str(idx),
                Form_ID=str(idx),
                Cognateset_ID=wordlist[idx, ref],
                Alignment=wordlist[idx, alignment] or [''] if alignment else ['']))

    ds.write(
        FormTable=forms,
        LanguageTable=languages.values(),
        ParameterTable=parameters.values(),
        CognateTable=cognates)
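A minimal conversion sketch with LingPy; the TSV filename and its column inventory (doculect, concept, ipa, tokens, cogid) are assumptions about the input wordlist.

from lingpy import Wordlist

wl = Wordlist("wordlist.tsv")  # hypothetical LingPy wordlist file
to_cldf(wl, path="cldf", ref="cogid", segments="tokens", form="ipa")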
def main(args):  # pragma: no cover
    wl = Wordlist.from_metadata(args.data_file('cldf', 'cldf-metadata.json'))
    data = Data()
    data.add(
        common.Contributor, 'barthwolfgang',
        id='barthwolfgang',
        name="Wolfgang Barth",
        url="http://www.dynamicsoflanguage.edu.au/")
    #
    # FIXME: get dataset attributes from CLDF metadata!
    #
    dataset = common.Dataset(
        id='parabank',
        name='Parabank Pronouns',
        description='Database of pronouns',
        domain='parabank.clld.org',
        publisher_name="CoEDL Centre of Excellence for the Dynamics of Language",
        publisher_place="Canberra, Australia",
        publisher_url="http://www.dynamicsoflanguage.edu.au/",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0'})
    DBSession.add(dataset)

    for i, editor in enumerate(['barthwolfgang']):
        common.Editor(
            dataset=dataset,
            contributor=data['Contributor'][editor],
            ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for l in wl['LanguageTable']:
        lang = data.add(
            models.ParabankLanguage, l['ID'],
            id=l['ID'],
            name=l['Name'],
            description=l['Notes'],
            source=l['Source_Citation'],
            classification=l['Classification'],
        )
        add_language_codes(data, lang, None, glottocode=l['Glottocode'])

    for p in wl['ParameterTable']:
        data.add(
            common.Parameter, p['ID'],
            id=p['ID'],
            name='{0} ({1})'.format(p['Name'], p['ID']),
            # description=p['Description'],
        )

    for f in wl['FormTable']:
        vsid = '{0}-{1}'.format(f['Parameter_ID'], f['Language_ID'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                language=data['ParabankLanguage'][f['Language_ID']],
                parameter=data['Parameter'][f['Parameter_ID']],
                contribution=contrib)
        DBSession.add(models.Word(
            id=f['ID'],
            name=f['Form'],
            comment=f.get('Comment'),
            original=f['Original_parameter'],
            valueset=vs))

    load_families(
        data,
        [(l.glottocode, l) for l in data['ParabankLanguage'].values()],
        glottolog_repos=args.data_file('glottolog'),
        isolates_icon='tcccccc')