def main(args): # pragma: no cover data = Data() print("Setting up dataset…") dataset = common.Dataset( id=cariban.__name__, domain="cariban.clld.org", name="Comparative Cariban Database", description="Comparative Cariban Database", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_url="https://www.eva.mpg.de", publisher_place="Leipzig", license="https://creativecommons.org/licenses/by/4.0/", contact="*****@*****.**", jsondata={'function_paradigms': []}, ) fps = [] for morph_func in args.cldf["ValueTable"]: for function in morph_func["Function"]: for cons in morph_func["Construction"]: fps.append({ 'Function': function, 'Construction': cons, 'Morpheme': morph_func['Morpheme']}) dataset.update_jsondata(function_paradigms=fps) DBSession.add(dataset) DBSession.flush() print("Adding contributors…") c = common.Contributor(id="fm",name="Florian Matter", email="*****@*****.**", url="https://florianmatter.gitlab.io/") dataset.editors.append(common.Editor(contributor=c, ord=1, primary=True)) print("Adding languages…") dialect_mapping = {} lang_shorthands = {} glottocodes = {} lang_ids = {} for lang in args.cldf["LanguageTable"]: if lang["Sampled"] == "y": language = data.add( common.Language, lang["ID"], id=lang["ID"], name=lang["Name"], latitude=float(lang["Latitude"]) if lang["Latitude"] is not None else None, longitude=float(lang["Longitude"]) if lang["Longitude"] is not None else None, jsondata={'Shorthand': lang['Shorthand'], 'Glottocode': lang['Glottocode']}, ) add_language_codes(data, language, isocode=lang["ISO"], glottocode = lang["Glottocode"]) if lang["Dialect_Of"] not in [None, "y"]: dialect_mapping[lang["ID"]] = lang["Dialect_Of"] lang_shorthands[lang["Shorthand"]] = {"ID": lang["ID"], "Name": lang["Name"]} glottocodes[lang["Glottocode"]] = {"ID": lang["ID"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]} lang_ids[lang["ID"]] = {"Glottocode": lang["Glottocode"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]} def get_lang_id(key): if key in lang_shorthands: lang_id = lang_shorthands[key]["ID"] elif key in glottocodes: lang_id = glottocodes[key]["ID"] elif key in lang_ids: lang_id = key else: print("Could not identify language %s" % key) return None if lang_id in dialect_mapping: lang_id = dialect_mapping[lang_id] return lang_id def get_key_and_page(source_string): if len(source_string.split("[")) > 1: bib_key = source_string.split("[")[0] pages = source_string.split("[")[1].split("]")[0] else: bib_key = source_string pages = "" return bib_key, pages print("Adding sources…") for rec in bibtex.Database.from_file(args.cldf.bibpath): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) print("Adding language sources…") DBSession.flush() for rec in bibtex.Database.from_file(args.cldf.bibpath): if "keywords" in rec: for keyword in rec["keywords"].split(","): if keyword in lang_shorthands: lang_id = get_lang_id(keyword.strip(" ")) if lang_id in data["Language"]: data.add(common.LanguageSource, rec.id+lang_id, language_pk=data["Language"][lang_id].pk, source_pk=data["Source"][rec.id].pk ) data.add( common.Source, "pc", id="pc", name="Personal communication", description="Placeholder for data obtained from personal communication.", bibtex_type=bibtex.EntryType.misc ) # print("Adding glossing abbreviations…") # length = len(pynterlinear.get_all_abbrevs().keys()) # for i, (key, name) in enumerate(pynterlinear.get_all_abbrevs().items()): # print("%s/%s" % (i+1, length), end="\r") # DBSession.add(common.GlossAbbreviation(id=key, name=name)) # print("") 
# print("Adding examples…") gloss_replacements = { "S_A_": "Sa", "S_P_": "Sp" } def clldify_glosses(gloss_line): for orig, new in gloss_replacements.items(): gloss_line = gloss_line.replace(orig,new) gloss_line = re.sub(r"(\d)([A-Z])", r"\1.\2", gloss_line) return gloss_line for ex in args.cldf["ExampleTable"]: lang_id = get_lang_id(ex["Language_ID"]) new_ex = data.add(common.Sentence, ex["ID"], id=ex["ID"], name=ex["Name"], description=ex["Translated_Text"], analyzed="\t".join(ex["Analyzed_Word"]), gloss=clldify_glosses("\t".join(ex["Gloss"])), language=data["Language"][lang_id], comment=ex["Comment"], markup_gloss="\t".join(ex["Morpheme_IDs"]) ) if ex["Source"]: bib_key, pages = get_key_and_page(ex["Source"]) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(common.SentenceReference( sentence=new_ex, source=source, key=source.id, description=pages.replace("--","–")) ) def add_morpheme_reference(morpheme, source_string): bib_key, pages = get_key_and_page(source_string) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.MorphemeReference( morpheme=morpheme, source=source, key=source.id, description=pages.replace("--","–") ) ) print("Adding morphemes…") for morph in args.cldf["FormTable"]: lang_id = get_lang_id(morph["Language_ID"]) form = util.merge_allomorphs("; ".join(morph["Form"])).split("; ") new_morph = data.add(models.Morpheme, morph["ID"], morpheme_type="grammatical", language=data["Language"][lang_id], name="/".join(form), id=morph["ID"], ) if morph["Source"]: add_morpheme_reference(new_morph, morph["Source"][0]) print("Adding constructions…") data.add(models.DeclarativeType, "imp", id="imp", name="imperative") data.add(models.DeclarativeType, "decl", id="decl", name="declarative") data.add(models.MainClauseVerb, "y", id="y", name="main clause construction") data.add(models.MainClauseVerb, "n", id="n", name="subordinate clause construction") for cons in args.cldf["ParameterTable"]: lang_id = get_lang_id(cons["Language_ID"]) new_construction = data.add( models.Construction, cons["ID"], id=cons["ID"], language=data["Language"][lang_id], name=cons["Description"], mainclauseverb=data["MainClauseVerb"][cons["MainClauseVerb"]], ) if cons["DeclarativeType"]: new_construction.declarativetype = data["DeclarativeType"][cons["DeclarativeType"]] def add_morph_func(morpheme, func_key, construction): data.add(models.MorphemeFunction, "%s:%s" % (morpheme, function), id="%s:%s" % (morpheme, func_key), name="MorphemeFunction %s:%s"% (morpheme, func_key), unit=data["Morpheme"][morpheme], unitparameter=data["Meaning"][function], construction=construction ) print("Adding morpheme functions…") for morph_func in args.cldf["ValueTable"]: for function in morph_func["Function"]: func_key = function.replace(".","_") if ">" in function or function == "LK" or bool(re.search(r"\d[SP]$", function) or function == "3"): meaning_type="inflectional" else: meaning_type="derivational" if function not in data["Meaning"]: data.add(models.Meaning, function, id=func_key, name=function, meaning_type=meaning_type ) #Only some morpheme functions are specified as occurring in specific constructions if len(morph_func["Construction"]) == 0: for morpheme in morph_func["Morpheme"]: add_morph_func(morpheme, func_key, None) else: for construction in morph_func["Construction"]: if len(morph_func["Morpheme"]) == 1 and morph_func["Morpheme"][0] != "?": for morpheme in morph_func["Morpheme"]: if data["Morpheme"][morpheme].language != 
data["Construction"][construction].language: print("Warning: the %s Morpheme %s is stated to occur in the %s construction %s!" % ( data["Morpheme"][morpheme].language, data["Morpheme"][morpheme], data["Construction"][construction].language, data["Construction"][construction] ) ) cons_func_key = func_key + ":" + construction add_morph_func(morpheme, cons_func_key, data["Construction"][construction]) print("Checking examples for illustrated morphemes…") proto_languages = ["pc"] is_illustrated = {} for key, row in data["MorphemeFunction"].items(): if row.unit.language.id in proto_languages: continue is_illustrated["%s:%s" % (row.unit.id, row.unitparameter.id)] = False for row in args.cldf["ExampleTable"]: for word in row["Morpheme_IDs"]: morph_ids = util.split_word(word) for unit_value in morph_ids: if unit_value in ["X","-","=", "~"]: continue unitvaluesentence_key = "{0}-{1}".format(unit_value.replace(".","-"),row["ID"]) if unitvaluesentence_key in data["UnitValueSentence"].keys(): continue is_illustrated[unit_value] = True morph_id = unit_value.split(":")[0] if morph_id not in data["Morpheme"].keys(): print("Warning: Example %s illustrates unknown morpheme %s" % (row["ID"], morph_id)) elif data["Morpheme"][morph_id].language != data["Sentence"][row["ID"]].language: print("Warning: The %s example %s claims to contain the %s morpheme %s." % ( data["Sentence"][row["ID"]].language, row["ID"], data["Morpheme"][morph_id].language, data["Morpheme"][morph_id] ) ) if ":" not in unit_value: print("%s in %s contains no defined function!" % (unit_value, row["ID"])) function = unit_value.split(":")[1] morph_function_id = "%s:%s" % (morph_id, function) if morph_function_id not in data["MorphemeFunction"].keys(): print("Warning: Example %s tries to illustrate inexistent morpheme function %s!" % (row["ID"], unit_value.replace(".","-"))) continue data.add(models.UnitValueSentence, unitvaluesentence_key, sentence=data["Sentence"][row["ID"]], unitvalue=data["MorphemeFunction"][morph_function_id], ) # see how many morpheme functions are illustrated with example sentences good_ill = [key for key, value in is_illustrated.items() if value] not_ill = [key for key, value in is_illustrated.items() if not value] not_ill.sort() cov = len(good_ill)/len(is_illustrated)*100 print("Morpheme exemplification coverage is at %s%%. 
List of unillustrated morphemes saved to unillustrated_morphemes.txt" % str(round(cov, 2))) f = open("../unillustrated_morphemes.txt", "w") for morph in not_ill: f.write(morph+"\n") f.close() print("Adding cognate sets…") for cogset in args.cldf["CognatesetTable"]: new_cset = data.add(models.Cognateset, cogset["ID"], id=cogset["ID"], name=cogset["Name"], description=cogset["Function"], cogset_type="grammatical" ) if cogset["Source"]: for source in cogset["Source"]: bib_key, pages = get_key_and_page(source) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.CognatesetReference( cognateset=new_cset, source=source, key=source.id, description=pages) ) print("Adding cognates…") for morph in args.cldf["FormTable"]: for cognate_ID in morph["Cognateset_ID"]: DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognate_ID], counterpart=data["Morpheme"][morph["ID"]] ) ) print("Adding morpheme comments…") for row in args.cldf["FormTable"]: data["Morpheme"][row["ID"]].markup_description=util.generate_markup(row["Comment"]) print("Adding construction descriptions…") for cons in args.cldf["ParameterTable"]: if cons["Comment"] is None: description = "" else: description = util.generate_markup(cons["Comment"]) description += "\n" + util.generate_markup(util.transitive_construction_paradigm(cons["ID"])) description += util.generate_markup(util.intransitive_construction_paradigm(cons["ID"])) data["Construction"][cons["ID"]].markup_description = description print("Adding cognate set descriptions…") for cogset in args.cldf["CognatesetTable"]: data["Cognateset"][cogset["ID"]].markup_description = util.generate_markup(cogset["Description"]) # if cogset["ID"] == "13pro": # data["Cognateset"][cogset["ID"]].markup_description += util.generate_markup( # util.comparative_function_paradigm( # ["apa_main", "tri_main", "way_main", "mak_main", "kar_main", "hix_main", "wai_main", "ara_main", "ikp_main", "wmr_main", "pan_old", "kax_main"], # "1+3 scenarios", # ["1+3S", "1+3>3", "3>1+3", "2>1+3", "1+3>2"])) def add_tree_labels(phylo): uncertain_nodes = [] for node in phylo.find_clades(): if node.name == None or not node.is_terminal(): continue plain_name = node.name.replace("?","") if "?" in node.name: uncertain_nodes.append(plain_name) if plain_name in lang_ids: node.name = lang_ids[plain_name]["Name"].replace("'", "’") if plain_name in uncertain_nodes: node.name += "?" return phylo, uncertain_nodes print("Adding trees…") own_trees = ["matter"] tree_path = str(args.cldf.tablegroup._fname.parent / '..' / 'raw') newick_files = {} for tree in args.cldf["cariban_trees.csv"]: if tree["ID"] in own_trees: continue newick_files[tree["ID"]] = { "orig": tree["ID"]+"_orig.newick", "norm": tree["ID"]+"_norm.newick", "source": tree["Source"], "comment": tree["Comment"], "o_comment": tree["Orig_Comment"] } #adding my own trees separately. for my_tree_count, tree_id in enumerate(own_trees): my_tree = Phylo.read(tree_path+"/"+"%s.newick" % tree_id, "newick") my_tree, uncertain_nodes = add_tree_labels(my_tree) edited_tree = io.StringIO() Phylo.write(my_tree, edited_tree, "newick") tree = edited_tree.getvalue().replace(":0.00000","") my_phylo = Phylogeny( tree_id, id=tree_id, name="Matter (2020)",# % str(my_tree_count+1), newick=tree, markup_description="My own, conservative, classification." ) for l in DBSession.query(common.Language): lname = l.name.replace("'", "’") if l.id in uncertain_nodes: lname += "?" 
new_label = LanguageTreeLabel( language=l, treelabel=TreeLabel( id="%s_%s" % (tree_id, l.id), name=lname, phylogeny=my_phylo ) ) DBSession.add(my_phylo) #adding the other trees for tree_id, values in newick_files.items(): norm_biotree = Phylo.read(tree_path+"/"+values["norm"], "newick") orig_biotree = Phylo.read(tree_path+"/"+values["orig"], "newick") norm_biotree, uncertain_nodes = add_tree_labels(norm_biotree) edited_tree = io.StringIO() Phylo.write(norm_biotree, edited_tree, "newick") norm_tree = edited_tree.getvalue().replace(":0.00000","") edited_tree = io.StringIO() Phylo.write(orig_biotree, edited_tree, "newick") orig_tree = edited_tree.getvalue().replace(":0.00000","") norm_phylo = Phylogeny( id=tree_id+"_norm", name=str(data["Source"][values["source"]]) + " (Normalized)", markup_description=util.generate_markup("Source: src:"+values["source"])+ "<br>This is a normalized version of <a href='/phylogeny/%s_orig'>this original tree</a>." % tree_id + util.generate_markup( "<br>Comments: %s" % values["comment"] ), newick=norm_tree ) if values["o_comment"] == None: o_comment = "" else: o_comment = values["o_comment"] orig_phylo = Phylogeny( id=tree_id+"_orig", name=str(data["Source"][values["source"]]) + " (Original)", markup_description=util.generate_markup("Source: src:"+values["source"])+ "<br>This is a representation of the original classification. A normalized version can be found <a href='/phylogeny/%s_norm'>here</a>." % tree_id + util.generate_markup( "<br>Comments: %s" % values["comment"] + " " + o_comment ), newick=orig_tree ) for l in DBSession.query(common.Language): lname = l.name.replace("'", "’") if l.id in uncertain_nodes: lname += "?" new_label = LanguageTreeLabel( language=l, treelabel=TreeLabel( id="%s_%s" % (tree_id, l.id), name=lname, phylogeny=norm_phylo ) ) DBSession.add(norm_phylo) DBSession.add(orig_phylo) print("Adding t-adding verb cognate sets…") for t_verb_set in args.cldf["cariban_t_cognates.csv"]: cognate_ID = "t"+t_verb_set["ID"] rec_t_form = "*[%s]%s" % (t_prefix_form(t_verb_set["Form"]), t_verb_set["Form"]) t_cogset = data.add(models.Cognateset, cognate_ID, id=cognate_ID, name=rec_t_form, description="‘%s’ (*t-adding verb)" % t_verb_set["Parameter_ID"], cogset_type="t_adding" ) if t_verb_set["Source"]: bib_key = t_verb_set["Source"].split("[")[0] if len(t_verb_set["Source"].split("[")) > 1: pages = t_verb_set["Source"].split("[")[1].split("]")[0] else: pages = " " if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.CognatesetReference( cognateset=t_cogset, source=source, key=source.id, description=pages) ) print("Adding t-adding verbs…") t_langs = {} t_verbs = {} non_t_adding_lgs = ["ing","mac","kar","wmr","pan"] data.add(models.Meaning, "t_verb", id="t-verb", name="t-adding verb", ) for t_verb_entry in args.cldf["cariban_t_verbs.csv"]: if t_verb_entry["Language_ID"] == "cari1283": continue cognate_ID = "t"+t_verb_entry["Cognateset_ID"] lang_id = get_lang_id(t_verb_entry["Language_ID"]) morph_id = lang_id+"_"+cognate_ID if morph_id in data["Morpheme"].keys(): if morph_id + "_2" in data["Morpheme"].keys(): morph_id += "_3" else: morph_id += "_2" t_verb = data.add(models.Morpheme, morph_id, id=morph_id, morpheme_type="t_adding", name=t_verb_entry["Form"], language=data["Language"][lang_id], ) DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognate_ID], counterpart=t_verb ) ) if t_verb_entry["t"] == "y": t_verb.name = "[%s]%s" % (t_prefix_form(t_verb.name), t_verb.name) t_verb.markup_description = 
util.generate_markup("Shows cogset:t") if t_verb_entry["t"] == "?" and lang_id not in non_t_adding_lgs: t_verb.name = "[t-?]"+t_verb.name t_verb.markup_description = util.generate_markup("It is not known if this verb shows cogset:t") if t_verb_entry["t"] == "n": t_verb.markup_description = util.generate_markup("Does not show cogset:t") if lang_id not in t_langs.keys(): t_langs[lang_id] = {"y": 0, "n": 0, "?": 0} if cognate_ID not in t_verbs.keys(): t_verbs[cognate_ID] = {"y": 0, "n": 0, "?": 0} t_langs[lang_id][t_verb_entry["t"]] += 1 if lang_id not in non_t_adding_lgs: t_verbs[cognate_ID][t_verb_entry["t"]] += 1 if t_verb_entry["Source"]: add_morpheme_reference(t_verb, t_verb_entry["Source"]) data.add(models.MorphemeFunction, "t_"+t_verb_entry["ID"], id="t_"+t_verb_entry["ID"], name="t-Verb %s" % t_verb_entry["Parameter_ID"], unit=t_verb, unitparameter=data["Meaning"]["t_verb"], construction=None ) for lang, values in t_langs.items(): data["Language"][lang].update_jsondata(t_values=values) for verb, values in t_verbs.items(): # data["Cognateset"][verb].description += " (%s/%s)" % (str(values["y"]), str(values["n"]+values["y"]+values["?"])) data["Cognateset"][verb].markup_description = util.generate_markup("This verb occurs with obj:t- in %s of %s languages which show reflexes of cogset:t." % (str(values["y"]), str(values["n"]+values["y"]+values["?"]))) print("Adding reconstructed lexemes…") proto_forms = {} for cogset in args.cldf["cariban_lexical_reconstructions.csv"]: proto_forms[cogset["ID"]] = cogset["Form"] first_found = [] for entry in args.cldf["cariban_swadesh_list.csv"]: cognateset_ID = entry["Parameter_ID"].replace("/","_")+"-"+entry["Cognateset_ID"] if cognateset_ID not in data["Cognateset"]: if cognateset_ID in proto_forms: form = "*" + proto_forms[cognateset_ID].replace("; ", " / ") # else: # form = "" data.add(models.Cognateset, cognateset_ID, id=cognateset_ID, name=form, description=cognateset_ID, cogset_type="lexical" ) lang_id = get_lang_id(entry["Language_ID"]) if lang_id not in data["Language"]: continue function = entry["Parameter_ID"].replace(".","_") morph_id = entry["Language_ID"] + "_" + function if morph_id in first_found: continue first_found.append(morph_id) if function not in data["Meaning"].keys(): data.add(models.Meaning, function, id=function, name=function, meaning_type="lexical" ) morpheme = data.add(models.Morpheme, morph_id, id=morph_id, morpheme_type="lexical", name=entry["Value"][0], language=data["Language"][lang_id], ) data.add(models.MorphemeFunction, "%s:%s" % (morph_id, function), id="%s:%s" % (morph_id, function), name="MorphemeFunction %s:%s"% (morph_id, function), unit=data["Morpheme"][morph_id], unitparameter=data["Meaning"][function], construction=None ) if entry["Source"]: add_morpheme_reference(morpheme, entry["Source"]) if cognateset_ID in proto_forms: DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognateset_ID], counterpart=morpheme ) )
def main(args):  # pragma: no cover
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset, vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format('-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )
    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        c = data.add(
            common.Contributor, cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor, 'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(common.Editor(ord=ord, dataset=ds, contributor=data['Contributor'][cid]))

    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution, lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(common.ContributionContributor(
                    contribution=contrib,
                    contributor=data['Contributor'][cid],
                    jsondata=dict(roles=roles),
                ))
        data.add(
            models.Variety, lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept, param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart, form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if getattr(c, 'name', None)])
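# The loader above (and several of the loaders below) relies on a recurring idiom: a
# ValueSet groups all forms for one (language, parameter) pair, so that pair is used as
# the cache key in the clld ``Data`` dict and the ValueSet is created only on first sight.
# A minimal, self-contained sketch of that idiom; the helper name ``get_or_create_valueset``
# is illustrative and not part of clld itself.
from clld.db.models import common  # clld's shared model classes


def get_or_create_valueset(data, form, contribution):
    """Return the cached ValueSet for the form's (language, parameter) pair, creating it once."""
    vsid = (form['languageReference'], form['parameterReference'])
    vs = data['ValueSet'].get(vsid)  # hit the in-memory Data cache first
    if not vs:
        vs = data.add(
            common.ValueSet, vsid,  # cache key: the (language, parameter) pair
            id='-'.join(vsid),
            language=data['Variety'][form['languageReference']],
            parameter=data['Concept'][form['parameterReference']],
            contribution=contribution,
        )
    return vs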
def main(args): # pragma: no cover ds = StructureDataset.from_metadata(DS) data = Data() for source in ds.sources: data.add(common.Source, source.id, _obj=bibtex2source(source)) ext = [ Record.from_string('@' + s, lowercase=True) for s in nfilter(BIB.split('@')) ] for rec in ext: if rec.id not in data['Source']: data.add(common.Source, rec.id, _obj=bibtex2source(rec)) for contrib in ds['contributors.csv']: o = data.add( common.Contributor, contrib['ID'], id=contrib['ID'].upper(), name=contrib['Name'], description=contrib['Description'], url=contrib['URL'], jsondata={ 'readme': contrib['Readme'], 'contents': contrib['Contents'] }, ) for src in contrib['Source']: DBSession.add( models.ContributorReference(source=data['Source'][src], contributor=o)) dataset = data.add( common.Dataset, 'phoible', id='phoible', name='PHOIBLE 2.0', description='PHOIBLE 2.0', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://www.shh.mpg.de", domain='phoible.org', license='https://creativecommons.org/licenses/by-sa/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'https://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License' }) for i, (cid, name) in enumerate([ ('UZ', "Steven Moran"), ('mccloy', "Daniel McCloy"), ], start=1): contrib = data['Contributor'].get(cid) if not contrib: contrib = common.Contributor(id=cid, name=name) DBSession.add( common.Editor(dataset=dataset, ord=i, contributor=contrib)) glottolog = Glottolog( Path(phoible.__file__).parent.parent.parent.parent.joinpath( 'glottolog', 'glottolog')) for lang in ds['LanguageTable']: l = data.add( models.Variety, lang['ID'], id=lang['ID'], name=lang['Name'], ) load_families(data, [(l.id, l) for l in data['Variety'].values() if len(l.id) == 8], glottolog.repos) DBSession.flush() # assign color codes: families = defaultdict(list) for l in data['Variety'].values(): families[l.family_pk].append(l) colors = color.qualitative_colors(len(families)) for i, langs in enumerate(sorted(families.values(), key=lambda v: -len(v))): for l in langs: l.jsondata = {'color': colors[i]} for segment in ds['ParameterTable']: equivalence_class = ''.join([ t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']] if t[1].split()[0] not in ['COMBINING', 'MODIFIER'] ]), data.add(models.Segment, segment['ID'], id=segment['ID'], name=segment['Name'], description=segment['Description'], segment_class=segment['SegmentClass'], equivalence_class=equivalence_class) DBSession.flush() # Add redirects for old language pages! get relevant ISO codes and map to Glottocode! 
for model, repls in load( Path(phoible.__file__).parent.parent / 'replacements.json').items(): if model == 'Language': languoids = {l.id: l for l in glottolog.languoids()} iso_languoids = {l.iso: l for l in languoids.values() if l.iso} gl_in_phoible = set(data['Variety'].keys()) for oid, nid in repls.items(): gls = descendants_from_nodemap( iso_languoids.get(oid), languoids).intersection(gl_in_phoible) if gls: nid = gls.pop() if len(gls) > 1: print('+++', oid, gls) else: print('---', oid) common.Config.add_replacement(oid, nid, common.Language) elif model == 'Parameter': segments_in_phoible = set(data['Segment'].keys()) for oid, nid in repls.items(): id_ = nid if nid in segments_in_phoible else None common.Config.add_replacement(oid, id_, common.Parameter) for segment in ds['ParameterTable']: for i, (k, v) in enumerate(sorted(segment.items())): if k not in ['ID', 'Name', 'Description', 'SegmentClass']: DBSession.add( common.Parameter_data( key=feature_name(k), value=v, ord=i, object_pk=data['Segment'][segment['ID']].pk)) for inventory in ds['contributions.csv']: inv = data.add( models.Inventory, inventory['ID'], id=inventory['ID'], name='{0} ({1} {2})'.format( inventory['Name'], inventory['Contributor_ID'].upper(), inventory['ID'], ), source_url=inventory['URL'], count_tone=inventory['count_tones'], count_vowel=inventory['count_vowels'], count_consonant=inventory['count_consonants'], ) DBSession.add( common.ContributionContributor( contribution=inv, contributor=data['Contributor'][ inventory['Contributor_ID'].upper()])) for src in inventory['Source']: DBSession.add( common.ContributionReference(contribution=inv, source=data['Source'][src])) for phoneme in ds['ValueTable']: lang = data['Variety'][phoneme['Language_ID']] inv = data['Inventory'][phoneme['Contribution_ID']] if not inv.language: inv.language = lang vs = common.ValueSet( id=phoneme['ID'], contribution=inv, language=lang, parameter=data['Segment'][phoneme['Parameter_ID']]) for ref in phoneme['Source']: DBSession.add( common.ValueSetReference(source=data['Source'][ref], valueset=vs)) DBSession.add( models.Phoneme( id=phoneme['ID'], name='%s %s' % (phoneme['Value'], data['Inventory'][phoneme['Contribution_ID']].name), allophones=' '.join(phoneme['Allophones']), marginal=phoneme['Marginal'], valueset=vs)) return
def main(args):
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset, papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format('-'.join([p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name},
    )
    contrib = data.add(
        common.Contribution, None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety, lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept, param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart, form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def main(args):
    _ = args
    data = Data()
    cldf_data = args.cldf

    data.add(
        common.Contributor, 'fehnannemarie',
        id='fehnannemarie',
        name="Anne-Marie Fehn",
        url="https://shh.mpg.de")
    # TODO: Editors/Contributors
    dataset = common.Dataset(
        id=kba.__name__,
        name="KBA",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='kba.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, editor in enumerate(['fehnannemarie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for language in cldf_data['LanguageTable']:
        lang = data.add(
            models.KbaLanguage, language['ID'],
            id=language['ID'],
            name=language['Name'])
        add_language_codes(data, lang, None, glottocode=language['Glottocode'])

    # TODO: Concepticon
    for parameter in cldf_data['ParameterTable']:
        data.add(
            common.Parameter, parameter['ID'],
            id=parameter['ID'],
            name='{0} ({1})'.format(parameter['Name'], parameter['ID']))

    for form in cldf_data['FormTable']:
        valueset_id = '{0}-{1}'.format(form['Parameter_ID'], form['Language_ID'])
        valueset = data['ValueSet'].get(valueset_id)
        # Unless we already have something in the VS:
        if not valueset:
            valueset = data.add(
                common.ValueSet, valueset_id,
                id=valueset_id,
                language=data['KbaLanguage'][form['Language_ID']],
                parameter=data['Parameter'][form['Parameter_ID']],
                contribution=contrib)
        DBSession.add(models.Word(
            id=form['ID'],
            name=form['Form'],
            comment=form.get('Comment'),
            sourceorthography=form.get('sourceorthography'),
            kbaorthography=form.get('kbaorthography'),
            wordclass=form.get('wordclass'),
            grammaticalnotes=form.get('grammaticalnotes'),
            idiolectalvariant=form.get('idiolectalvariant'),
            originaltranslation=form.get('originaltranslation'),
            valueset=valueset))

    load_families(
        data,
        [(l.glottocode, l) for l in data['KbaLanguage'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc')
def main(args): assert args.glottolog, 'The --glottolog option is required!' clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts') data = Data() ds = data.add( common.Dataset, lsi.__name__, id=lsi.__name__, name= 'The Comparative Vocabularies of the "Linguistic Survey of India" Online', domain='lsi.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }, ) for i, name in enumerate( ['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']): common.Editor(dataset=ds, ord=i, contributor=common.Contributor(id=slug( HumanName(name).last), name=name)) contrib = data.add( common.Contribution, None, id='cldf', name=args.cldf.properties.get('dc:title'), description=args.cldf.properties.get('dc:bibliographicCitation'), ) for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'): data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], order=int(lang['Order']), number=lang['NumberInSource'], family_in_source=lang['FamilyInSource'], ) for rec in bibtex.Database.from_file(args.cldf.bibpath): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'): data.add( models.Concept, param['id'], id=param['id'], name='{} [{}]'.format(param['name'], param['id']), description=param['Concepticon_Gloss'], concepticon_id=param['concepticonReference'], pages=param['PageNumber'], ) inventories = collections.defaultdict(set) for form in iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'): inventories[form['languageReference']] = inventories[ form['languageReference']].union(form['Segments']) vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) data.add( models.Form, form['id'], id=form['id'], name=form['form'], description=''.join(form['Segments']).replace('+', ' '), segments=' '.join(form['Segments']), valueset=vs, ) for lid, inv in inventories.items(): inv = [clts.bipa[c] for c in inv] data['Variety'][lid].update_jsondata(inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')]) for (vsid, sid), pages in refs.items(): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages)))) load_families( Data(), [(l.glottocode, l) for l in data['Variety'].values()], glottolog_repos=args.glottolog, isolates_icon='tcccccc', strict=False, )
def main(args):
    data = Data()
    icons = cycle(ORDERED_ICONS)
    dataset = common.Dataset(
        id=gelato.__name__,
        name="GeLaTo",
        description="Genes and Languages together",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='gelato.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('barbierichiara', 'Chiara Barbieri'),
        ('blasidamian', 'Damián Blasi'),
        ('forkelrobert', 'Robert Forkel'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    families = {}
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for r in args.cldf.iter_rows('ContributionTable', 'id', 'name', 'description'):
        ds = data.add(
            models.Panel, r['id'],
            id=r['id'], name=r['name'], description=r['description'])

    for row in args.cldf.iter_rows('LanguageTable', 'id', 'name', 'contributionReference'):
        icon = families.get(row['LanguageFamily_Glottocode'])
        if not icon:
            families[row['LanguageFamily_Glottocode']] = icon = next(icons)
        lang = data['Languoid'].get(row['Glottocode'])
        if not lang:
            lang = data.add(
                models.Languoid, row['Glottocode'],
                id=row['Glottocode'],
                name=row['Language_Name'],
                family_id=row['LanguageFamily_Glottocode'],
                family_name=row['LanguageFamily'],
                jsondata=dict(icon=icon.name),
            )
        s = data.add(
            models.Sample, row['id'],
            id=row['id'],
            name=row['Name'],
            panel=data['Panel'][row['contributionReference']],
            languoid=lang,
            latitude=row['Latitude'],
            longitude=row['Longitude'],
            samplesize=int(row['samplesize']),
            # source=row.get('dataSet.of.origin'),
            region=row['geographicRegion'],
            # location=row['location'],
            jsondata=dict(color=REGIONS[row['geographicRegion']]),
        )
        DBSession.flush()
        for bibkey in row['Source']:
            DBSession.add(common.LanguageSource(
                language_pk=s.pk, source_pk=data['Source'][bibkey].pk))

    types = {}
    for row in args.cldf.iter_rows(
            'ParameterTable', 'id', 'name', 'description', 'contributionReference'):
        types[row['id']] = Datatype.fromvalue(row['datatype'])
        data.add(
            models.Measure, row['id'],
            id=row['id'],
            name=row['name'],
            description=row['description'],
            panel=data['Panel'][row['contributionReference']])

    for row in args.cldf.iter_rows('ValueTable', 'id', 'parameterReference', 'languageReference'):
        v = types[row['parameterReference']].read(row['Value'])
        if isinstance(v, float):
            vs = data.add(
                common.ValueSet, row['id'],
                id=row['id'],
                language=data['Sample'][row['languageReference']],
                parameter=data['Measure'][row['parameterReference']],
                # contribution=ds,
                # jsondata=dict(color=REGIONS[sample.region]),
            )
            data.add(
                models.Measurement, row['id'],
                id=row['id'], valueset=vs, name=row['Value'], value=v)
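# The gelato loader above decides per parameter how to parse raw values: it builds a
# csvw Datatype from the ParameterTable's ``datatype`` column and only keeps values
# that read as floats. A minimal sketch of that mechanism, assuming ``Datatype`` lives
# in ``csvw.metadata`` and using 'float' as a stand-in for the column's declared type:
from csvw.metadata import Datatype

dt = Datatype.fromvalue('float')   # in the loader: Datatype.fromvalue(row['datatype'])
v = dt.read('0.42')                # in the loader: types[...].read(row['Value'])
if isinstance(v, float):
    print('numeric measurement:', v)   # only such values become Measurement objects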
def main(args):  # pragma: no cover
    data = Data()
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    ds = data.add(
        common.Dataset, tppsr.__name__,
        id=tppsr.__name__,
        name='Tableaux phonétiques des patois suisses romands Online',
        domain='tppsr.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds, ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))
    contrib = data.add(
        common.Contribution, None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety, lang['id'],
            id=lang['Number'],
            name=lang['name'],
            description=lang['FullName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            canton=lang['Canton'],
            group=lang['DialectGroup'],
            recorded=lang['DateOfRecording'],
            population=int(lang['Population']) if lang['Population'] else None,
            speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None,
            speaker_proficiency=lang['SpeakerProficiency'],
            speaker_language_use=lang['SpeakerLanguageUse'],
            speaker_gender=lang['SpeakerGender'],
            investigators=lang['Investigators'],
        )
    colors = qualitative_colors(len(set(l.canton for l in data['Variety'].values())), set='tol')
    for i, (_, langs) in enumerate(itertools.groupby(
        sorted(data['Variety'].values(), key=lambda l: l.canton),
        lambda l: l.canton,
    )):
        for lang in langs:
            lang.update_jsondata(color=colors[i])

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept, param['id'],
            id=param['Number'],
            number=int(param['Number']),
            name='{} [{}]'.format(param['name'], param['Number']),
            latin_gloss=param['Latin_Gloss'],
            french_gloss=param['French_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            concepticon_concept_id=param['id'].split('_')[0],
        )

    inventories = collections.defaultdict(set)
    scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl
    for form in iteritems(args.cldf, 'FormTable', 'id', 'value', 'form',
                          'languageReference', 'parameterReference', 'source'):
        if not form['form']:
            continue
        inventories[form['languageReference']] = inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        f = data.add(
            models.Form, form['id'],  # Gauchat-1925-480-1_
            id=form['id'],
            name=form['form'].replace('+', ' '),
            description=form['value'],
            segments=' '.join(form['Segments']),
            valueset=vs,
            scan=scan_url_template.expand(**form),
            prosodic_structure=form['ProsodicStructure'],
        )

    for example in args.cldf['ExampleTable']:
        sentence = models.Phrase(
            id=example['ID'],
            language=data['Variety'][example['Language_ID']],
            name=example['Primary_Text'],
            description=example['Translated_Text'],
            original_script=example['Alt_Transcription'],
        )
        for cid in example['Concept_ID']:
            DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence))
        for fid in example['Form_ID']:
            DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))
def main(args): assert args.glottolog, 'The --glottolog option is required!' license = licenses.find(args.cldf.properties['dc:license']) assert license and license.id.startswith('CC-') data = Data() ds = data.add( common.Dataset, mixezoqueanvoices.__name__, id=mixezoqueanvoices.__name__, name="Mixe-Zoquean Voices", domain='mixezoqueanvoices.clld.org', publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", license=license.url, jsondata={ 'license_icon': '{}.png'.format('-'.join( [p.lower() for p in license.id.split('-')[:-1]])), 'license_name': license.name }, ) contrib = data.add( common.Contribution, None, id='cldf', name=args.cldf.properties.get('dc:title'), description=args.cldf.properties.get('dc:bibliographicCitation'), ) data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic') data.add(common.Contributor, 'gray', id='gray', name='Russell Gray') DBSession.add( common.ContributionContributor( contribution=contrib, contributor=data['Contributor']['kondic'], )) for i, ed in enumerate(['kondic', 'gray']): data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i) ancestors = collections.defaultdict(list) gl = Glottolog(args.glottolog) lnames = {} for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'): lnames[lang['id']] = lang['name'] glang = None if lang['glottocode']: glang = gl.languoid(lang['glottocode']) lineage = [i[0] for i in glang.lineage] if 'Mixe-Zoque' in lineage: ancestors[lang['id']].append('Protomixezoque') if 'Mixe' in lineage: ancestors[lang['id']].append('Protomixe') if 'Oaxaca Mixe' in lineage: ancestors[lang['id']].append('Protooaxacamixe') if not glang: assert lang['name'] == 'Nizaviguiti' data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], description=lang['LongName'], subgroup=glang.lineage[1][0] if glang and len(glang.lineage) > 1 else None, ) colors = dict( zip( set(l.subgroup for l in data['Variety'].values()), qualitative_colors( len(set(l.subgroup for l in data['Variety'].values()))))) for l in data['Variety'].values(): l.jsondata = dict(color=colors[l.subgroup].replace('#', '')) for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) # Store proto-forms for later lookup: proto_forms = collections.defaultdict( lambda: collections.defaultdict(list)) for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference'): if form['languageReference'].startswith('Proto'): proto_forms[form['languageReference']][ form['parameterReference']].append(form['form']) for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'): proto = collections.OrderedDict() for lid, forms in proto_forms.items(): f = forms.get(param['id']) if f: proto[lnames[lid]] = f data.add( models.Concept, param['id'], id=param['id'], name='{} [{}]'.format(param['name'], param['id'].split('_')[0]), concepticon_id=param['concepticonReference'], concepticon_gloss=param['Concepticon_Gloss'], description=param['Spanish_Gloss'], jsondata=dict(reconstructions=proto), ) f2a = form2audio(args.cldf) for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'): assert not (form['form'] == '►' and not f2a.get(form['id'])) 
vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) proto = collections.OrderedDict() for lid in ancestors.get(form['languageReference'], []): f = proto_forms[lid].get(form['parameterReference']) if f: proto[lnames[lid]] = f data.add( Counterpart, form['id'], id=form['id'], name=form['form'], valueset=vs, audio=f2a.get(form['id']), jsondata=dict(reconstructions=proto), ) for (vsid, sid), pages in refs.items(): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages))))
def main(args):  # pragma: no cover
    get_repos()
    api = Grambank(REPOS['Grambank'])
    cldf = args.cldf
    data = Data()
    dataset = models.Grambank(
        id=grambank.__name__,
        name="Grambank",
        description="Grambank",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    contributors = {}
    for i, contrib in enumerate(api.contributors):
        contrib = common.Contributor(contrib.id, id=contrib.id, name=contrib.name)
        common.Editor(dataset=dataset, contributor=contrib, ord=i)
        DBSession.add(contrib)
        DBSession.flush()
        contributors[contrib.id] = contrib.pk

    contributions = {r['ID']: r for r in cldf['LanguageTable']}
    DBSession.add(dataset)

    for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)), desc='sources'):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()
    sources = {k: v.pk for k, v in data['Source'].items()}

    features, codes = import_features(cldf, contributors)
    transaction.commit()

    values_by_sheet = [
        (lid, list(v)) for lid, v in itertools.groupby(
            sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']),
            lambda r: r['Language_ID'],
        )]
    for lid, values in tqdm(values_by_sheet, desc='loading values'):
        transaction.begin()
        import_values(values, contributions[lid], features, codes, contributors, sources)
        transaction.commit()

    transaction.begin()
    glottolog = Glottolog(REPOS['glottolog'])
    languoids = {l.id: l for l in glottolog.languoids()}
    gblangs = DBSession.query(models.GrambankLanguage).all()
    load_families(data, gblangs, glottolog_repos=REPOS['glottolog'], isolates_icon='dcccccc')

    # Add isolates
    for lg in gblangs:
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family, gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    coverage.main(glottolog)
    return
def main(args):
    data = Data()
    doi = input('DOI of the released dataset: ')
    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date.today(),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'doi': doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
    for c in args.cldf['contributors.csv']:
        contrib = data.add(
            models.WaveContributor, c['ID'],
            id=c['ID'],
            name=c['Name'],
            email=c['Email'],
            url=c['URL'],
            address=c['Address'],
            sortkey=HumanName(c['Name']).last,
        )
        m = ed_pattern.match(c['ID'])
        if m:
            common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord')))

    for fc in args.cldf['featurecategories.csv']:
        data.add(
            models.FeatureCategory, fc['ID'],
            id=fc['ID'], name=fc['Name'], description=fc['Description'])

    for vt in args.cldf['varietytypes.csv']:
        data.add(
            models.VarietyType, vt['ID'],
            id=vt['ID'],
            name=vt['Name'],
            description=vt['Description'],
            jsondata=VARIETY_TYPE_ICONS[vt['ID']],
        )

    for vt in args.cldf['regions.csv']:
        data.add(models.Region, vt['ID'], id=vt['ID'], name=vt['Name'])

    for lang in args.cldf['LanguageTable']:
        l = data.add(
            models.Variety, lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
            latitude=lang['Latitude'],
            longitude=lang['Longitude'],
            abbr=lang['abbr'],
            region=data['Region'][lang['Region_ID']],
            type=data['VarietyType'][lang['Type_ID']],
        )
        if lang['Glottocode']:
            add_language_codes(data, l, None, glottocode=lang['Glottocode'])
        c = data.add(
            models.WaveContribution, lang['ID'],
            id=str(lang['ID']),
            name=lang['Name'],
            description=lang['Description'],
            variety=l)
        for i, cid in enumerate(lang['Contributor_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=data['WaveContributor'][cid],
                ord=i + 1,
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            models.Feature, param['ID'],
            id=param['ID'],
            category=data['FeatureCategory'][param['Category_ID']],
            name=param['Name'],
            description=param['Description'],
            jsondata={'example_source': param['Example_Source']})

    for de in args.cldf['CodeTable']:
        data.add(
            common.DomainElement, de['ID'],
            id=de['ID'],
            parameter=data['Feature'][de['Parameter_ID']],
            name=de['Name'],
            description=de['Description'],
            jsondata={'color': CODE_COLORS[de['Name']]},
            number=de['Number'])

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for example in args.cldf['ExampleTable']:
        s = data.add(
            common.Sentence, example['ID'],
            id=example['ID'],
            name=example['Primary_Text'],
            gloss='\t'.join(example['Gloss']) if example['Gloss'] else None,
            comment=example['Comment'] or None,
            description=example['Translated_Text'] or None,
            language=data['Variety'][example['Language_ID']])
        for ref in example['Source']:
            sid, pages = Sources.parse(ref)
            DBSession.add(common.SentenceReference(
                sentence=s, source=data['Source'][sid], description=pages, key=sid))

    for value in args.cldf['ValueTable']:
        de = data['DomainElement'][value['Code_ID']]
        vs = data.add(
            common.ValueSet, value['ID'],
            id=value['ID'],
            contribution=data['WaveContribution'][value['Language_ID']],
            parameter=data['Feature'][value['Parameter_ID']],
            jsondata=de.jsondata,
            language=data['Variety'][value['Language_ID']])
        v = data.add(
            common.Value, value['ID'],
            id=value['ID'],
            domainelement=de,
            valueset=vs)
        for eid in value['Example_ID']:
            DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))
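# Both the sentence references above and the ValueSet references in the other loaders
# rely on pycldf's ``Sources.parse``, which splits a CLDF source reference of the form
# "key[pages]" into the BibTeX key and the page specification. A minimal sketch,
# assuming the import path ``pycldf.sources``; when there is no bracketed page spec,
# the second element is empty (hence the ``nfilter(pages)`` calls in the loaders above).
from pycldf.sources import Sources

sid, pages = Sources.parse('meier2005[12-15]')
# sid == 'meier2005', pages == '12-15'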
def main(args):
    internal = input('[i]nternal or [e]xternal data (default: e): ').strip().lower() == 'i'
    which_submission = input(
        "submission id or 'all' for all submissions (default: all): ").strip().lower() or 'all'

    data = Data()
    dataset = common.Dataset(
        id=crossgram.__name__,
        name='Crossgram',
        description='Crossgram',
        published=date(2019, 12, 12),
        domain='crossgram.clld.org',
        # XXX Is any of this correct?
        publisher_name='Max Planck Institute for the Science of Human History',
        publisher_place='Jena',
        publisher_url='https://ssh.mpg.de',
        license='http://creativecommons.org/licenses/by/4.0',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    for i, (id_, name) in enumerate([('haspelmathmartin', 'Martin Haspelmath')]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    internal_repo = pathlib.Path('../../crossgram/crossgram-internal')
    cache_dir = internal_repo / 'datasets'
    cache_dir.mkdir(exist_ok=True)
    if internal:
        submissions_path = internal_repo / 'submissions-internal'
    else:
        submissions_path = internal_repo / 'submissions'

    language_id_map = {}
    for contrib_dir in submissions_path.iterdir():
        if not contrib_dir.is_dir():
            continue
        if which_submission != 'all' and which_submission != contrib_dir.name:
            continue
        sid = contrib_dir.name
        print('Loading submission', sid, '...')
        contrib_md = jsonlib.load(contrib_dir / 'md.json')

        intro = None
        try:
            with (contrib_dir / 'intro.md').open(encoding='utf-8') as f:
                intro = f.read()
        except IOError:
            # If there is no intro, there is no intro *shrug*
            pass

        path = download_data(sid, contrib_md, cache_dir)
        if not path.exists():
            print('could not find folder', str(path))
            continue
        submission = CLDFBenchSubmission.load(path, contrib_md)

        date_match = re.fullmatch(r'(\d+)-(\d+)-(\d+)', contrib_md['published'])
        assert date_match
        yyyy, mm, dd = date_match.groups()
        published = date(int(yyyy), int(mm), int(dd))

        # strip the ssh prefix off the git link
        git_https = re.sub('^git@([^:]*):', r'https://\1/', contrib_md.get('repo') or '')

        contrib = data.add(
            models.CrossgramData, sid,
            id=sid,
            number=int(contrib_md['number']),
            published=published,
            name=submission.title,
            doi=contrib_md.get('doi'),
            git_repo=git_https,
            description=intro or submission.readme)
        submission.add_to_database(data, language_id_map, contrib)
        print('... done')

    DBSession.flush()

    print('Loading language family data...')
    catconf = cldfcatalog.Config.from_file()
    glottolog_path = catconf.get_clone('glottolog')
    load_families(
        Data(),
        [v for v in DBSession.query(models.Variety) if re.fullmatch('[a-z]{4}[0-9]{4}', v.id)],
        strict=False,
        glottolog_repos=glottolog_path)
    print('... done')
def main(args): data = Data() dataset = common.Dataset( id=amsd.__name__, name="AMSD", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="https://www.eva.mpg.de", license="https://creativecommons.org/licenses/by/4.0/", domain='amsd.clld.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) editors = OrderedDict([('Piers Kelly', None)]) # data_entry => Contributor for row in sorted(dicts('data_entry'), key=lambda x: [x['name'].lower()]): if row['name'] in editors: editors[row['name']] = row['pk'] data.add( common.Contributor, row['pk'], id=row['pk'], name=row['name'] ) for i, cid in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=data['Contributor'][cid], ord=i + 1) for row in dicts('source_citation'): data.add( common.Source, row['pk'], id=row['pk'], note=row['name'], name=row['name'], ) for row in dicts('ling_area'): data.add( models.ling_area, row['pk'], chirila_name=row['chirila_name'], austlang_code=row['austlang_code'], austlang_name=row['austlang_name'], glottolog_code=row['glottolog_code'], ) fd = {} for row in dicts('linked_filenames'): if row['name'] not in ['00-Text_reference.png', '00-No_image_available.png']: fd[row['pk']] = dict( name=row['name'], oid=row['oid'], path=row['path'], mimetype=mimetypes.guess_type(row['path'])[0] if row['path'] else None, ) for m in 'item_type technique keywords material source_type '\ 'sem_domain holder_file item_subtype cultural_region'.split(): for row in dicts(m): data.add( getattr(models, m), row['pk'], name=row['name'], ) DBSession.flush() # sticks => MessageStick no_fts_cols = ['pk', 'latitude', 'longitude', 'item_type', 'irn', 'data_entry', 'dim_1', 'dim_2', 'dim_3', 'data_entry', 'ling_area_1', 'ling_area_2', 'ling_area_3', 'holder_file'] x_cols = ['sem_domain', 'material', 'source_type', 'technique', 'keywords', 'holder_file', 'item_type', 'item_subtype', 'cultural_region'] for i, row in enumerate(dicts('sticks')): fts_items = [] for col in row.keys(): if col: if col == 'amsd_id': fts_items.append(row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i),) elif col not in no_fts_cols and not col.endswith('_pk'): fts_items.append(row[col]) for t in x_cols: if row[t]: for _, k in enumerate(row[t].split(';')): fts_items.append(str(data[t][k])) fts_items.extend(str(data[t][k]).split('_')) for t in ['ling_area_1', 'ling_area_2', 'ling_area_3']: if row[t]: for _, k in enumerate(row[t].split(';')): fts_items.append(data['ling_area'][k].chirila_name) fts_items.append(data['ling_area'][k].austlang_code) fts_items.append(data['ling_area'][k].austlang_name) fts_items.append(data['ling_area'][k].glottolog_code) if row['source_citation']: for k in row['source_citation'].split(';'): data.add( common.ContributionReference, k, contribution_pk=int(row['pk']), source_pk=int(k), ) fts_items.append(str(data['Source'][k])) if row['linked_filenames']: for j, k in enumerate(row['linked_filenames'].split(';')): if k in fd: oid = fd[k].get('oid') mt = fd[k].get('mimetype') refobjid = '' if mt == 'application/pdf': refobjid = oid # use for web, thumbnail a place holder image oid = 'EAEA0-52CC-0295-6B71-0' n = fd[k].get('name') data.add( common.Contribution_files, k, id='%s-%s-%i' % (k, row['pk'], j), object_pk=int(row['pk']), name=n, jsondata=dict( original=fd[k].get('path'), objid=oid, refobjid=refobjid, web='web.jpg', thumbnail='thumbnail.jpg', ), ord=j, 
mime_type=mt, ) fts_items.append(n) fts_items.extend(nfilter(re.split(r'[_\-\.]', n))) data.add( models.MessageStick, row['pk'], id=row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i), title=row['title'], description=row['description'], obj_creator=row['obj_creator'], date_created=row['date_created'], note_place_created=row['note_place_created'], place_created=row['place_created'], item_type_pk=row['item_type'] or None, item_subtype_pk=row['item_subtype'] or None, cultural_region_pk=row['cultural_region'] or None, ling_area_1_pk=row['ling_area_1'] or None, ling_area_2_pk=row['ling_area_2'] or None, ling_area_3_pk=row['ling_area_3'] or None, notes_ling_area=row['notes_ling_area'], stick_term=row['stick_term'], message=row['message'], motifs=row['motifs'], motif_transcription=row['motif_transcription'], dim_1=row['dim_1'], dim_2=row['dim_2'], dim_3=row['dim_3'], date_collected=row['date_collected'], holder_file_pk=row['holder_file'] or None, holder_obj_id=row['holder_obj_id'], collector=row['collector'], place_collected=row['place_collected'], creator_copyright=row['creator_copyright'], file_copyright=row['file_copyright'], latitude=row['lat'] or None, longitude=row['long'] or None, notes_coords=row['notes_coords'], url_institution=row['url_institution'], url_source_1=row['url_source_1'], url_source_2=row['url_source_2'], irn=row['irn'], notes=row['notes'], data_entry=row['data_entry'], fts=fts.tsvector('\n'.join(re.sub(r'[_\-]', '.', v) for v in fts_items if v)), ) DBSession.flush() for row in dicts('sticks'): for t in ['sem_domain', 'material', 'source_type', 'technique', 'keywords']: if row[t]: for _, k in enumerate(row[t].split(';')): data.add( getattr(models, 'x_%s' % (t)), k, object_pk=int(row['pk']), item_pk=int(k), )
def main(args, repos=None): glottolog = get_glottolog_api(repos) fts.index('fts_index', models.Ref.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") version = assert_release(glottolog.repos) dataset = common.Dataset( id='glottolog', name="{0} {1}".format(glottolog.publication.web.name, version), publisher_name=glottolog.publication.publisher.name, publisher_place=glottolog.publication.publisher.place, publisher_url=glottolog.publication.publisher.url, license=glottolog.publication.license.url, domain=purl.URL(glottolog.publication.web.url).domain(), contact=glottolog.publication.web.contact, jsondata={ 'license_icon': 'cc-by.png', 'license_name': glottolog.publication.license.name}, ) data = Data() for e in glottolog.current_editors: ed = data.add(common.Contributor, e.id, id=e.id, name=e.name) common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord)) DBSession.add(dataset) contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog') DBSession.add(common.ContributionContributor( contribution=contrib, contributor=data['Contributor']['hammarstroem'])) # # Add Parameters: # add = functools.partial(add_parameter, data) add('fc', name='Family classification') add('sc', name='Subclassification') add('aes', glottolog.aes_status.values(), name=glottolog.aes_status.__defaults__['name'], pkw=dict( jsondata=dict( reference_id=glottolog.aes_status.__defaults__['reference_id'], sources=[attr.asdict(v) for v in glottolog.aes_sources.values()], scale=[attr.asdict(v) for v in glottolog.aes_status.values()])), dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)), ) add('med', glottolog.med_types.values(), name='Most Extensive Description', dekw=lambda de: dict( name=de.name, description=de.description, number=de.rank, jsondata=dict(icon=de.icon)), ) add('macroarea', glottolog.macroareas.values(), pkw=dict( description=glottolog.macroareas.__defaults__['description'], jsondata=dict(reference_id=glottolog.macroareas.__defaults__['reference_id'])), dekw=lambda de: dict( name=de.name, description=de.description, jsondata=dict(geojson=read_macroarea_geojson(glottolog, de.name, de.description)), ), ) add('ltype', glottolog.language_types.values(), name='Language Type', dekw=lambda de: dict(name=de.category, description=de.description), delookup='category', ) add('country', glottolog.countries, dekw=lambda de: dict(name=de.id, description=de.name), ) legacy = jsonlib.load(gc2version()) for gc, version in legacy.items(): data.add(models.LegacyCode, gc, id=gc, version=version) # # Now load languoid data, keeping track of relations that can only be inserted later. # lgsources = defaultdict(list) # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that top-level # nodes will precede nested nodes. 
This order must be preserved using an `OrderedDict`: nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()]) lgcodes = {k: v.id for k, v in glottolog.languoids_by_code(nodemap).items()} for lang in tqdm(list(nodemap.values())): for ref in lang.sources: lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id) load_languoid(glottolog, data, lang, nodemap) for gc in glottolog.glottocodes: if gc not in data['Languoid'] and gc not in legacy: common.Config.add_replacement(gc, None, model=common.Language) for obj in jsonlib.load(glottolog.references_path('replacements.json')): common.Config.add_replacement( '{0}'.format(obj['id']), '{0}'.format(obj['replacement']) if obj['replacement'] else None, model=common.Source) DBSession.flush() for doctype in glottolog.hhtypes: data.add( models.Doctype, doctype.id, id=doctype.id, name=doctype.name, description=doctype.description, abbr=doctype.abbv, ord=doctype.rank) for bib in glottolog.bibfiles: data.add( models.Provider, bib.id, id=bib.id, name=bib.title, description=bib.description, abbr=bib.abbr, url=bib.url) DBSession.flush() s = time() for i, entry in enumerate( BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()): if i % 10000 == 0: args.log.info('{0}: {1:.3}'.format(i, time() - s)) s = time() ref = load_ref(data, entry, lgcodes, lgsources) if 'macro_area' in entry.fields: mas = [] for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True): ma = 'North America' if ma == 'Middle America' else ma ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma) mas.append(ma.name) ref.macroareas = ', '.join(mas)
def main(args): for (org, repos), recs in itertools.groupby( sorted(oai.Records('tular'), key=lambda r: (r.repos.org, r.repos.repos, r.version), reverse=True), lambda r: (r.repos.org, r.repos.repos), ): if org == 'tupian-language-resources' and repos in DATASETS: DATASETS[repos] = next(recs) data = Data() dataset = data.add( common.Dataset, 'tular', id=tular.__name__, domain="tular.clld.org", name="TuLaR", description="Tupían Language Resources", publisher_name="Max-Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", license='https://creativecommons.org/licenses/by-sa/4.0/', contact="*****@*****.**", jsondata={ 'license_icon': 'cc-by-sa.png', 'license_name': 'Creative Commons Attribution-ShareAlike 4.0 International License' }, ) rd = pathlib.Path(tular.__file__).parent.parent.parent.resolve() root = input('Project dir [{}]: '.format(str(rd))) root = pathlib.Path(root) if root else rd clts = clts_from_input(rd / '..' / 'cldf-clts' / 'clts-data') for db, rec in DATASETS.items(): print(db, rec.doi, rec.tag) dbdir = root.joinpath(db) assert dbdir.exists() md = jsonlib.load(dbdir / 'metadata.json') name = md['title'] if md['description']: name += ': {}'.format(md['description']) contribution = data.add( Database, db, id=db, name=name, description=rec.citation if rec else None, doi=rec.doi if rec else None, ) header, contribs = next( iter_markdown_tables( dbdir.joinpath('CONTRIBUTORS.md').read_text(encoding='utf8'))) for i, contrib in enumerate(contribs): contrib = dict(zip(header, contrib)) cid = slug(HumanName(contrib['Name']).last) contributor = data['Contributor'].get(cid) if not contributor: contributor = data.add( common.Contributor, cid, id=cid, name=contrib['Name'], description=contrib.get('Affiliation'), ) DBSession.add( common.ContributionContributor( contribution=contribution, contributor=contributor, primary='author' in contrib['Role'].lower(), ord=i, )) for i, cid in enumerate( ['gerardi', 'reichert', 'aragon', 'list', 'forkel']): DBSession.add( common.Editor(contributor=data['Contributor'][cid], dataset=dataset, ord=i)) source_ids = list(add_sources(args.cldf.bibpath, DBSession)) sources = {s.id: s.pk for s in DBSession.query(common.Source)} subgroups = [] for row in args.cldf['LanguageTable']: if row['SubGroup'] not in subgroups: subgroups.append(row['SubGroup']) family = data['Family'].get(row['Family']) if (not family) and row['Family']: family = data.add(Family, row['Family'], id=slug(row['Family']), name=row['Family']) data.add( Doculect, row['ID'], id=row['ID'], name=row['Name'].replace('_', ' '), family=family, subfamily=row['SubGroup'], iso_code=row['ISO639P3code'], glotto_code=row['Glottocode'], longitude=row['Longitude'], latitude=row['Latitude'], jsondata=dict(icon=SUBGROUPS[row['SubGroup']]), ) tudet = Dataset.from_metadata(root / 'tudet' / 'cldf' / 'Generic-metadata.json') seen = set() for row in tudet['ExampleTable']: if row['ID'] in seen: print('skipping duplicate sentence ID {}'.format(row['ID'])) continue seen.add(row['ID']) DBSession.add( Example(id=row['ID'], name=row['Primary_Text'], description=row['Translated_Text'], language=data['Doculect'][row['Language_ID']], conllu=row['conllu'])) contrib = data['Database']['tuled'] for row in args.cldf['ParameterTable']: data.add( Concept, row['ID'], id=row['ID'].split('_')[0], name=row['Name'], portuguese=row['Portuguese_Gloss'], semantic_field=row['Semantic_Field'], concepticon_class=row['Concepticon_ID'], eol=row['EOL_ID'], ) for (lid, pid), rows in itertools.groupby( 
sorted(args.cldf.iter_rows('FormTable', 'languageReference', 'parameterReference'), key=lambda r: (r['Language_ID'], r['Parameter_ID'])), lambda r: (r['Language_ID'], r['Parameter_ID']), ): vsid = '{}-{}'.format(lid, pid) vs = data.add( common.ValueSet, vsid, id=vsid, language=data['Doculect'][lid], parameter=data['Concept'][pid], contribution=contrib, ) refs = set() for row in rows: data.add( Word, row['ID'], id=row['ID'], valueset=vs, name=row['Form'], tokens=' '.join(row['Segments']), simple_cognate=int(row['SimpleCognate']), notes=row['Comment'], morphemes=' '.join(row['Morphemes']), partial_cognate=' '.join([k for k in row['PartialCognates']]) if row['PartialCognates'] else None, ) refs = refs.union(row['Source']) for ref in refs: if ref in source_ids: DBSession.add( common.ValueSetReference(valueset=vs, source_pk=sources[slug( ref, lowercase=False)])) load_inventories(args.cldf, clts, data['Doculect']) for row in args.cldf['CognateTable']: cc = data['Cognateset'].get(row['Cognateset_ID']) if not cc: cc = data.add( Cognateset, row['Cognateset_ID'], id=row['Cognateset_ID'], name=row['Cognateset_ID'], contribution=contrib, ) data.add( Cognate, row['ID'], cognateset=cc, counterpart=data['Word'][row['Form_ID']], alignment=' '.join(row['Alignment'] or []), )
def main(args): data = Data() dataset = common.Dataset( id=cobl2.__name__, name="IE-CoR", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://www.eva.mpg.de", license="https://creativecommons.org/licenses/by/4.0/", domain='iecor.clld.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) editors = OrderedDict([('Heggarty', None), ('Anderson', None), ('Scarborough', None)]) for row in sorted(ds['authors.csv'], key=lambda x: [ x['Last_Name'].lower(), x['First_Name'].lower()]): if row['Last_Name'] in editors: editors[row['Last_Name']] = row['ID'] data.add( models.Author, row['ID'], id=row['ID'], name='{0} {1}'.format(row['First_Name'], row['Last_Name']), url=row['URL'], photo=data_uri(photos[row['Last_Name']], 'image/jpg') if row['Last_Name'] in photos else None) for i, cid in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=data['Author'][cid], ord=i + 1) for src in ds.sources.items(): for invalid in ['isbn', 'part', 'institution']: if invalid in src: del src[invalid] data.add( common.Source, src.id, id=src.id, name=src.get('author', src.get('editor')), description=src.get('title', src.get('booktitle')), bibtex_type=getattr(EntryType, src.genre, EntryType.misc), **src) re_links = re.compile(r'\[(?P<label>[^\]]+?)\]\((?P<type>.+?)-(?P<id>\d+)\)') link_map = { 'cog': '/cognatesets/', 'lex': '/values/', 'src': '/sources/', } def parse_links(m): try: return '<a href="{}{}">{}</a>'.format( link_map[m.group('type')], m.group('id'), m.group('label')) except KeyError: print("parse_links: type error in '{}'".format(":".join(m.groups()))) return '[{}]({}-{})'.format(m.group('label'), m.group('type'), m.group('id')) for param in ds['ParameterTable']: data.add( models.Meaning, param['ID'], id=slug(param['Name']), name=param['Name'], description_md=param['Description_md'], concepticon_id=int(param['Concepticon_ID']) if param['Concepticon_ID'] != '0' else None, ) for row in ds['clades.csv']: data.add( models.Clade, row['ID'], id=row['ID'], level0_name=row['level0_name'], level1_name=row['level1_name'], level2_name=row['level2_name'], level3_name=row['level3_name'], clade_level0=row['clade_level0'], clade_level1=row['clade_level1'], clade_level2=row['clade_level2'], clade_level3=row['clade_level3'], clade_name=row['clade_name'], short_name=row['short_name'], color=row['color'], ) for row in ds['LanguageTable']: c = data.add( common.Contribution, row['ID'], id=row['ID'], name=row['Name'], ) for i, cid in enumerate(row['Author_ID']): DBSession.add(common.ContributionContributor( contribution=c, contributor=data['Author'][cid], ord=i + 1)) data.add( models.Variety, row['ID'], id=slug(row['Name']), name=row['Name'], latitude=float(row['Latitude']) if row['Latitude'] is not None else None, longitude=float(row['Longitude']) if row['Longitude'] is not None else None, contribution=c, color=rgb_as_hex(row['Color']), clade=', '.join(filter(None, row['Clade'])), clade_name=row['clade_name'], glottocode=row['Glottocode'], historical=row['historical'], distribution=row['distribution'], logNormalMean=row['logNormalMean'], logNormalOffset=row['logNormalOffset'], logNormalStDev=row['logNormalStDev'], normalMean=row['normalMean'], normalStDev=row['normalStDev'], ascii_name=row['ascii_name'], iso=row['ISO639P3code'], lang_description=row['Description'], variety=row['Variety'], loc_justification=row['loc_justification'] or 
None, sort_order=row['sort_order'] ) vsrs = set() for row in ds['FormTable']: vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID'])) if not vs: vs = data.add( common.ValueSet, (row['Language_ID'], row['Parameter_ID']), id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']), language=data['Variety'][row['Language_ID']], parameter=data['Meaning'][row['Parameter_ID']], contribution=data['Contribution'][row['Language_ID']], ) v = data.add( models.Lexeme, row['ID'], id=row['ID'], name=row['Form'], native_script=row['native_script'], phonetic=row['phon_form'], phonemic=row['Phonemic'], comment=re_links.sub(parse_links, row['Comment'] or ''), url=row['url'], gloss=row['Gloss'], valueset=vs ) for src in row['Source']: sid, pages = ds.sources.parse(src) key = (vs.id, sid, pages) if pages: pages = pages.replace('|', ';') if key not in vsrs: DBSession.add(common.ValueSetReference( valueset=vs, source=data['Source'][sid], description=pages)) vsrs.add(key) for row in ds['CognatesetTable']: cc = data.add( models.CognateClass, row['ID'], id=row['ID'], name=row['ID'], root_form=row['Root_Form_calc'] if row['Root_Form_calc'] is not None and len(row['Root_Form_calc']) else row['Root_Form'], root_form_calc=row['Root_Form_calc'] or None, root_gloss=row['Root_Gloss'] or None, root_language=row['Root_Language_calc'] if row['Root_Language_calc'] is not None and len(row['Root_Language_calc']) else row['Root_Language'], root_language_calc=row['Root_Language_calc'] or None, comment=re_links.sub(parse_links, row['Comment'] or ''), justification=re_links.sub(parse_links, row['Justification'] or ''), ideophonic=row['Ideophonic'] or None, parallel_derivation=row['parallelDerivation'] or None, revised_by=','.join(row['revised_by']) or None, superset_id=int(row['supersetid']) if row['supersetid'] else None, ) for src in row['Source']: sid, pages = ds.sources.parse(src) if pages: pages = pages.replace('|', ';') DBSession.add(clld_cognacy_plugin.models.CognatesetReference( cognateset=cc, source=data['Source'][sid], description=pages)) DBSession.flush() cc_id_pk_map = {str(ccid): cc.pk for ccid, cc in data['CognateClass'].items()} for row in ds['CognatesetTable']: if row['proposedAsCognateTo_pk']: DBSession.add(models.ProposedCognates( cc1_pk=data['CognateClass'][row['ID']].pk, cc2_pk=cc_id_pk_map[str(row['proposedAsCognateTo_pk'])], scale=row['proposedAsCognateToScale'] )) DBSession.flush() loans = {ln['Cognateset_ID']: ln for ln in ds['loans.csv']} for ccid, cc in data['CognateClass'].items(): if ccid in loans: le = loans[ccid] if le['SourceCognateset_ID']: cc.loan_source_pk = data['CognateClass'][le['SourceCognateset_ID']].pk else: cc.loan_source_pk = None cc.loan_notes = le['Comment'] cc.loan_source_languoid = le['Source_languoid'] cc.loan_source_form = le['Source_form'] cc.parallel_loan_event = le['Parallel_loan_event'] cc.is_loan = True for row in ds['CognateTable']: cc = data['CognateClass'][row['Cognateset_ID']] if cc.meaning_pk is None: cc.meaning_pk = data['Lexeme'][row['Form_ID']].valueset.parameter_pk else: assert data['Lexeme'][row['Form_ID']].valueset.parameter_pk == cc.meaning_pk data.add( clld_cognacy_plugin.models.Cognate, row['ID'], cognateset=data['CognateClass'][row['Cognateset_ID']], counterpart=data['Lexeme'][row['Form_ID']], doubt=row['Doubt'], ) l_by_gc = {} for s in DBSession.query(models.Variety): l_by_gc[s.glottocode] = s.pk tree = Phylogeny( id='1', name='Bouckaert et al.', description='', newick=Path.read_text(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'newick.txt'), ) 
for k, taxon in enumerate(reader(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'taxa.csv', namedtuples=True)): label = TreeLabel( id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1), name=taxon.taxon, phylogeny=tree, description=taxon.glottocode) if taxon.glottocode in l_by_gc: LanguageTreeLabel(language_pk=l_by_gc[taxon.glottocode], treelabel=label) DBSession.add(tree) l_by_ascii = {} for s in DBSession.query(models.Variety): l_by_ascii[s.ascii_name] = s.pk tree = Phylogeny( id='2', name='CoBL consensus', description='', newick=Path.read_text(data_file_path / 'raw' / 'ie122' / 'newick.txt'), ) for k, taxon in enumerate(reader(data_file_path / 'raw' / 'ie122' / 'taxa.csv', namedtuples=True)): label = TreeLabel( id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1), name=taxon.taxon, phylogeny=tree) if taxon.taxon in l_by_ascii: LanguageTreeLabel(language_pk=l_by_ascii[taxon.taxon], treelabel=label) DBSession.add(tree)
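Comment and justification fields in the IE-CoR loader above embed cross-references as `[label](type-id)` links, which `parse_links` rewrites to site-internal anchors. A simplified sketch (the error fallback is omitted and the comment string is invented):

import re

link_map = {'cog': '/cognatesets/', 'lex': '/values/', 'src': '/sources/'}
re_links = re.compile(r'\[(?P<label>[^\]]+?)\]\((?P<type>.+?)-(?P<id>\d+)\)')

def parse_links(m):
    # '[label](cog-42)' -> '<a href="/cognatesets/42">label</a>'
    return '<a href="{}{}">{}</a>'.format(link_map[m.group('type')], m.group('id'), m.group('label'))

assert re_links.sub(parse_links, 'cf. [the shared root](cog-42)') == \
    'cf. <a href="/cognatesets/42">the shared root</a>'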
def main(args): def data_file(*comps): return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps) data = Data() data.add( common.Dataset, 'tsammalex', id="tsammalex", name="Tsammalex", description="Tsammalex: A lexical database on plants and animals", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", domain='tsammalex.clld.org', license='http://creativecommons.org/licenses/by/4.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex") for rec in Database.from_file(data_file('sources.bib'), lowercase=True): data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec)) load_ecoregions(data_file, data) load_countries(data) second_languages = {} def languoid_visitor(lang, row, _): add_language_codes(data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None) second_languages[row[0]] = row[8] def habitat_visitor(cat, *_): cat.is_habitat = True def taxon_visitor(auto, taxon, *_): if auto.get(taxon.id): update_taxon_data(taxon, auto[taxon.id], data) else: print('--> missing in taxa.json:', taxon.id, taxon.name) taxon.countries_str = ' '.join([e.id for e in taxon.countries]) taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions]) auto = {s['id']: s for s in jsonload(data_file('taxa.json'))} for model, kw in [ (models.Lineage, {}), (models.Use, {}), (models.TsammalexContributor, {}), (models.Languoid, dict(visitor=languoid_visitor)), (models.Category, dict(name='categories')), (models.Category, dict(name='habitats', visitor=habitat_visitor)), (models.Taxon, dict(visitor=partial(taxon_visitor, auto))), (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])), ]: from_csv(data_file, model, data, **kw) for key, ids in second_languages.items(): target = data['Languoid'][key] for lid in models.split_ids(ids): if lid in data['Languoid']: # we ignore 2nd languages which are not yet in Tsammalex. target.second_languages.append(data['Languoid'][lid]) def image_url(source_url, type_): return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace('/original/', '/%s/' % type_) for fname in data_files(data_file, 'images.csv'): for image in reader(fname, namedtuples=True, delimiter=","): if image.taxa__id not in data['Taxon']: continue url = URL(image.source_url) if url.host() != 'edmond.mpdl.mpg.de': continue jsondata = dict(url=image.source_url, thumbnail=image_url(image.source_url, 'thumbnail'), web=image_url(image.source_url, 'web')) f = common.Parameter_files(object=data['Taxon'][image.taxa__id], id=image.id, name=image.tags, jsondata=jsondata, mime_type=image.mime_type) for k in 'source creator date place comments permission'.split(): v = getattr(image, k) if v: models.ImageData(key=k, value=v, image=f)
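The `image_url` helper above derives the `web` and `thumbnail` variants served by the site from the `original` resolution URL. Roughly (the Edmond URL below is invented):

import re

def image_url(source_url, type_):
    # swap the file extension for .jpg and the resolution directory for the requested type
    return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace('/original/', '/%s/' % type_)

assert image_url('https://edmond.mpdl.mpg.de/imeji/original/item1.png', 'web') == \
    'https://edmond.mpdl.mpg.de/imeji/web/item1.jpg'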
def main(args): Index('ducet', collkey(func.translate(common.Value.name, 'ˈ,ː,ˌ', '')))\ .create(DBSession.bind) data = Data() dataset = common.Dataset( id=numerals.__name__, name="Numeralbank", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain="numerals.clld.org", jsondata={ "license_icon": "cc-by.png", "license_name": "Creative Commons Attribution 4.0 International License", }, ) DBSession.add(dataset) for i, (id_, name) in enumerate( [("verkerkannemarie", "Annemarie Verkerk"), ("rzymskichristoph", "Christoph Rzymski")] ): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) # Take metadata from the curated CLDF dataset ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json') # Parameters: for parameter in ds["ParameterTable"]: data.add( models.NumberParameter, parameter["ID"], id=parameter["ID"], name="{0}".format(parameter["ID"]), concepticon_id=parameter['Concepticon_ID'], ) basis_parameter = data.add( models.NumberParameter, "0", id="0", name="Base", ) load_family_langs = [] for language in ds["LanguageTable"]: lang = data.add( models.Variety, language["ID"], id=language["ID"], name=language["Name"], latitude=language["Latitude"], longitude=language["Longitude"], creator=language["Contributor"], comment=language["Comment"], url_soure_name=language["SourceFile"], ) if language["Glottocode"]: load_family_langs.append((language["Glottocode"], lang)) # get original forms ds = Wordlist.from_metadata(data_repos[0]['data_path'] / 'cldf' / 'cldf-metadata.json') org_forms = {f["ID"]: f for f in ds["FormTable"]} d = data_repos[1] contrib = data.add( common.Contribution, d['id'], id=d['id'], name=d['name'] ) # process curated forms ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json') # Add Base info if given for language in ds["LanguageTable"]: if language["Base"]: basis = language["Base"] de = data["DomainElement"].get(basis) if not de: de = data.add( common.DomainElement, basis, id=text_type(basis), name=text_type(basis), parameter=basis_parameter, ) vs = data.add( common.ValueSet, data["Variety"][language["ID"]].id, id=data["Variety"][language["ID"]].id, language=data["Variety"][language["ID"]], parameter=basis_parameter, contribution=contrib, ) common.Value( id=data["Variety"][language["ID"]].id, valueset=vs, domainelement=de ) # Forms: for form in ds["FormTable"]: valueset_id = "{0}-{1}".format(form["Parameter_ID"], form["Language_ID"]) valueset = data["ValueSet"].get(valueset_id) # Unless we already have something in the VS: if not valueset: if form["Language_ID"] in data["Variety"]: vs = data.add( common.ValueSet, valueset_id, id=valueset_id, language=data["Variety"][form["Language_ID"]], parameter=data["NumberParameter"][form["Parameter_ID"]], contribution=contrib, ) org_form = "" if form["ID"] in org_forms: if unicodedata.normalize('NFC', org_forms[form["ID"]]["Form"].strip()) != form["Form"]: org_form = org_forms[form["ID"]]["Form"] else: org_form = "no original form" DBSession.add( models.NumberLexeme( id=form["ID"], name=form["Form"], comment=form["Comment"], is_loan=form["Loan"], other_form=form["Other_Form"], org_form=org_form, is_problematic=form["Problematic"], valueset=vs, ) ) load_families( Data(), load_family_langs, glottolog_repos=gl_repos, strict=False, ) distinct_varieties =
DBSession.query(models.Variety.family_pk).distinct().all() families = dict( zip([r[0] for r in distinct_varieties], color.qualitative_colors(len(distinct_varieties))) ) for l in DBSession.query(models.Variety): l.jsondata = {"color": families[l.family_pk]} p = common.Parameter.get("0") colors = color.qualitative_colors(len(p.domain)) for i, de in enumerate(p.domain): de.jsondata = {"color": colors[i]}
def load_ecoregions(filter=None): """Load ecoregion features from ecoregions.json into the database. :param filter: optional callable ``(eco_code, props)`` returning True for ecoregions that should be loaded; by default all ecoregions with a mapped biome are loaded. """ ecoregions = jsonlib.load( pathlib.Path(pytsammalex.__file__).parent / 'ecoregions.json')['features'] biome_map = { 1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'), 2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'), 3: ('Tropical & Subtropical Coniferous Forests', ''), 4: ('Temperate Broadleaf & Mixed Forests', ''), 5: ('Temperate Conifer Forests', ''), 6: ('Boreal Forests/Taiga', ''), 7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'), 8: ('Temperate Grasslands, Savannas & Shrublands', ''), 9: ('Flooded Grasslands & Savannas', '0265fe'), 10: ('Montane Grasslands & Shrublands', 'cdffcc'), 11: ('Tundra', ''), 12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'), 13: ('Deserts & Xeric Shrublands', 'feff99'), 14: ('Mangroves', '870083'), } data = Data() for eco_code, features in itertools.groupby( sorted(ecoregions, key=lambda e: e['properties']['eco_code']), key=lambda e: e['properties']['eco_code']): features = list(features) props = features[0]['properties'] if filter and not filter(eco_code, props): continue if int(props['BIOME']) not in biome_map: continue biome = data['Biome'].get(props['BIOME']) if not biome: name, color = biome_map[int(props['BIOME'])] biome = data.add(Biome, props['BIOME'], id=str(int(props['BIOME'])), name=name, description=color or 'ffffff') centroid = (None, None) f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1] if f['geometry']: coords = f['geometry']['coordinates'][0] if f['geometry']['type'] == 'MultiPolygon': coords = coords[0] centroid = get_center(coords) polygons = nfilter([_f['geometry'] for _f in features]) data.add(Ecoregion, eco_code, id=eco_code, name=props['ECO_NAME'], description=props['G200_REGIO'], latitude=centroid[1], longitude=centroid[0], biome=biome, area=props['area_km2'], gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])], realm=Ecoregion.realm_map[props['REALM']], jsondata=dict(polygons=polygons))
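The optional `filter` callback receives the eco_code and the feature properties of each ecoregion group. A hypothetical call restricting the load to a single realm (assuming 'AT' is the realm code used in ecoregions.json for the Afrotropics):

load_ecoregions(filter=lambda eco_code, props: props.get('REALM') == 'AT')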
def main(args): # pragma: no cover data = Data() clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data' clts_repos = CLTS(clts_repos) print(clts_repos.repos) version = 'v2.1.0' # assert_release(clts_repos.repos) for rec in Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) dataset = common.Dataset( id='clts', name="CLTS {0}".format(version), publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", contact='*****@*****.**', domain='clts.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) for i, name in enumerate([ 'Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi', 'Robert Forkel', ]): c = common.Contributor(id=slug(name), name=name) dataset.editors.append(common.Editor(contributor=c, ord=i)) for line in args.cldf['data/features.tsv']: data.add( models.Feature, line['ID'], id=line['ID'], name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']), sound_type=line['TYPE'], feature=line['FEATURE'], value=line['VALUE'], ) DBSession.add(models.SoundSegment( id='NA', name='<NA>', description='<NA>', type='marker', generated=True, unicode='', color='#bbbbbb', )) for line in args.cldf['data/sounds.tsv']: s = data.add( models.SoundSegment, line['ID'], id=line['ID'], name=line['GRAPHEME'], description=line['NAME'], type=line['TYPE'], generated=line['GENERATED'], unicode=' / '.join(line['UNICODE']), color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']), ) if s.color == '0': s.color = '#bbbbbb' assert s.color in LEGEND DBSession.flush() seen = set() for line in args.cldf['data/sounds.tsv']: for fid in line['FEATURES']: spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk if (spk, fpk) not in seen: DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk)) seen.add((spk, fpk)) english = data.add( common.Language, 'eng', id='eng', name='English') for line in args.cldf['sources/index.tsv']: c = data.add( models.Transcription, line['NAME'], id=line['NAME'], name=line['NAME'], description=line['DESCRIPTION'].replace(':bib:', '/sources/'), datatype=getattr(models.Datatype, line['TYPE']) ) for ref in line.get('REFS', []): common.ContributionReference(source=data['Source'][ref], contribution=c) sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl for line in args.cldf['data/graphemes.tsv']: key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME'] if key not in data['Grapheme']: sound_id = line['NAME'].replace(' ', '_') vs = data['ValueSet'].get((line['DATASET'], line['NAME'])) if not vs: try: vs = data.add( common.ValueSet, (line['DATASET'], line['NAME']), id=key, description=line['NAME'], language=english, contribution=data['Transcription'][line['DATASET']], parameter=data['SoundSegment'][sound_id] ) except: print(line) raise data.add( models.Grapheme, key, id=key, name=line['GRAPHEME'], description=line['NAME'], url=line['URL'].unsplit() if line['URL'] else None, audio=sound_url_template.expand(line) if line['SOUND'] else None, image=image_url_template.expand(line) if line['IMAGE'] else None, valueset=vs )
def main(args): data = Data() print(args.data_file('x')) dataset = common.Dataset( id=grammaticon.__name__, name="Grammaticon", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='grammaticon.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) for i, ed in enumerate(['Martin Haspelmath', 'Robert Forkel']): common.Editor(dataset=dataset, contributor=get_contributor(data, ed), ord=i + 1) eng = data.add(common.Language, 'eng', name='English') for obj in reader(args.data_file('Feature_lists.csv'), dicts=True): contrib = data.add( models.Featurelist, obj['id'], id=slug(obj['name']), name=obj['name'], year=obj['year'], number_of_features=int(obj['number of features']) if obj['number of features'] else None, url=obj['year']) if obj['authors']: for i, author in enumerate(obj['authors'].split(',')): common.ContributionContributor( contribution=contrib, contributor=get_contributor(data, author), ord=i + 1) #id,name,feature_area for name, objs in itertools.groupby( sorted(reader(args.data_file('Metafeatures.csv'), dicts=True), key=lambda i: i['name']), lambda i: i['name']): dbobj = None for obj in objs: if not dbobj: dbobj = data.add( models.Metafeature, obj['id'], id=slug(obj['id']), name=obj['name'], area=obj['feature_area']) else: data['Metafeature'][obj['id']] = dbobj DBSession.flush() #feature_ID,feature name,feature description,meta_feature_id,collection_id,collection URL,collection numbers for obj in reader(args.data_file('Features.csv'), dicts=True): if int(obj['collection_id']) == 8: obj['collection_id'] = '1' if (not obj['meta_feature_id']): #or obj['meta_feature_id'] in ('89'): print('skipping: {}'.format(obj)) continue vsid = (data['Featurelist'][obj['collection_id']].pk, data['Metafeature'][obj['meta_feature_id']].pk) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='{0}-{1}'.format(*vsid), contribution=data['Featurelist'][obj['collection_id']], parameter=data['Metafeature'][obj['meta_feature_id']], language=eng) models.Feature( valueset=vs, id=slug(obj['feature_ID']), name=obj['feature name'], description=obj['feature description']) for obj in reader(args.data_file('Concepts.csv'), dicts=True): data.add( models.Concept, obj['id'], id=obj.pop('id'), name=obj.pop('label'), description=obj.pop('definition'), **{k.replace(' ', '_'): v for k, v in obj.items()}) for obj in reader(args.data_file('Concepts_metafeatures.csv'), dicts=True): if obj['meta_feature__id'] in ('89',): print('skipping: {}'.format(obj)) continue if obj['concept_id'] and obj['meta_feature__id']: models.ConceptMetafeature( concept=data['Concept'][obj['concept_id']], metafeature=data['Metafeature'][obj['meta_feature__id']]) for obj in reader(args.data_file('Concepthierarchy.csv'), dicts=True): child = data['Concept'].get(obj['concept_id']) if child: parent = data['Concept'].get(obj['concept_parent_id']) if parent: DBSession.add(models.ConceptRelation(parent=parent, child=child))
def main(args): assert args.glottolog, 'The --glottolog option is required!' data = Data() ds = data.add( common.Dataset, jambu.__name__, id=jambu.__name__, name='Jambu', domain='jambu-clld.herokuapp.com', publisher_name="Georgetown University", publisher_place="Washington", publisher_url="http://gucl.georgetown.edu/", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }, ) for i, name in enumerate(['Aryaman Arora']): common.Editor(dataset=ds, ord=i, contributor=common.Contributor(id=slug( HumanName(name).last), name=name)) contrib = data.add( common.Contribution, None, id='cldf', name=args.cldf.properties.get('dc:title'), description=args.cldf.properties.get('dc:bibliographicCitation'), ) print("Languages...") for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'glottocode', 'longitude', 'latitude', 'Clade'): data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], family=lang['Clade'], ) for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) print("Cognates...") for cognate in iteritems(args.cldf, 'CognateTable'): # print(cognate) data.add(models.Cognate_, cognate['Cognateset_ID'], name=cognate['Form'], language=cognate['Language_ID'], description=cognate['Description']) counts = collections.defaultdict(set) print("Forms...") for form in tqdm( iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source')): counts[form['parameterReference']].add(form['languageReference']) print("Params...") for param in tqdm( iteritems(args.cldf, 'ParameterTable', 'ID', 'Name', 'Concepticon_ID', 'Description')): data.add(models.Concept, param['ID'], id=param['ID'], name='{} [{}]'.format(param['Name'], param['ID']), description=param['Description'], count=len(counts[param['ID']])) print("Forms...") for form in tqdm( iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source')): l = re.split(r";|\+", form['parameterReference']) for i, paramref in enumerate(l): if paramref == '?': continue vsid = (form['languageReference'], paramref) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][paramref], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) data.add( models.Lexeme, form['id'] + '-' + str(i) if len(l) > 1 else form['id'], id=form['id'] + '-' + str(i) if len(l) > 1 else form['id'], name=form['form'], gloss=form['Gloss'], native=form['Native'], phonemic='/' + form['Phonemic'] + '/' if form['Phonemic'] else None, description=form['Description'], cognateset=form['Cognateset'], valueset=vs, ) print("Refs...") for (vsid, sid), pages in tqdm(refs.items()): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages))))
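Forms in the Jambu data can gloss several concepts at once, so `parameterReference` is split on `;` and `+` before the lexemes are linked to ValueSets. The split behaves as follows (the gloss string is made up):

import re

assert re.split(r";|\+", 'water;river+lake') == ['water', 'river', 'lake']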
def main(args): assert args.glottolog, 'The --glottolog option is required!' data = Data() data.add( common.Dataset, polyglottaafricana.__name__, id=polyglottaafricana.__name__, domain='', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }, ) contrib = data.add( common.Contribution, None, id='cldf', name=args.cldf.properties.get('dc:title'), description=args.cldf.properties.get('dc:bibliographicCitation'), ) for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'): data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], ) for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'): data.add( models.Concept, param['id'], id=param['id'], name='{} [{}]'.format(param['name'], param['id']), ) for form in iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'): vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) data.add( common.Value, form['id'], id=form['id'], name=form['form'], valueset=vs, ) for (vsid, sid), pages in refs.items(): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages)))) load_families( Data(), [(l.glottocode, l) for l in data['Variety'].values()], glottolog_repos=args.glottolog, isolates_icon='tcccccc', strict=False, )
def main(args): data = Data() ds = Pofatu( pathlib.Path(pofatu.__file__).parent.parent.parent / 'pofatu-data') dataset = common.Dataset( id=pofatu.__name__, name="POFATU", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="https://www.eva.mpg.de", license="https://creativecommons.org/licenses/by/4.0/", domain='pofatu.clld.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) for i, (id_, name) in enumerate([ ('hermannaymeric', 'Aymeric Hermann'), ('forkelrobert', 'Robert Forkel'), ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) for rec in ds.iterbib(): rec.genre = bibtex.EntryType.from_string( ENTRY_TYPES.get(rec.genre, rec.genre)) if 'date' in rec: rec['year'] = rec.pop('date') data.add(common.Source, rec.id, _obj=bibtex2source(rec, lowercase_id=False)) analyses = list(ds.iterdata()) def midpoint(coords): p = MultiPoint([(lat, lon + 360 if lon < 0 else lon) for lat, lon in coords]).convex_hull #geojson = { # 'type': 'Feature', # 'properties': {}, # 'geometry': mapping(p)} c = p.centroid return c.x, (c.y - 360) if c.y > 180 else c.y artefacts = collections.defaultdict(dict) midpoints = {} for a in analyses: l = a.sample.location lid = l.id if lid not in midpoints: midpoints[lid] = set() if l.latitude is not None and l.longitude is not None: midpoints[lid].add((l.latitude, l.longitude)) art = a.sample.artefact for attr_ in ['name', 'category', 'collection_type']: if not artefacts[slug(art.id)].get(attr_): artefacts[slug(art.id)][attr_] = getattr(art, attr_) midpoints = { k: midpoint(v) if v else (None, None) for k, v in midpoints.items() } for analysis in analyses: loc = analysis.sample.location if loc.id not in data['Location']: data.add( models.Location, loc.id, id=valid_id(loc.id), name=loc.label, latitude=midpoints[loc.id][0], longitude=midpoints[loc.id][1], region=loc.region.replace('_', ' '), subregion=loc.subregion, location=loc.locality, ) # Add contributions for contrib in ds.itercontributions(): contribution = data.add( common.Contribution, contrib.id, id=valid_id(contrib.id), name=contrib.label, description=contrib.description, ) DBSession.flush() for i, name in enumerate(contrib.contributors): cid = slug(name) co = data['Contributor'].get(cid) if not co: co = data.add(common.Contributor, cid, id=cid, name=name) common.ContributionContributor(ord=i, contribution=contribution, contributor=co) for ref in contrib.source_ids: DBSession.add( common.ContributionReference( contribution=contribution, source=data['Source'][ref], )) data['Contribution'][ref] = contribution methods = collections.defaultdict(list) for method in ds.itermethods(): m = data.add( models.Method, method.id, id=valid_id(method.id), name=method.label, code=method.code, parameter=method.parameter.strip(), instrument=method.instrument, number_of_replicates=method.number_of_replicates, date=method.date, comment=method.comment, detection_limit=method.detection_limit, detection_limit_unit=method.detection_limit_unit, total_procedural_blank_value=method.total_procedural_blank_value, total_procedural_unit=method.total_procedural_unit, ) methods[(m.code.lower(), m.parameter.lower())].append(m) for ref in method.references: DBSession.add( models.MethodReference( method=m, sample_name=ref.sample_name, sample_measured_value=ref.sample_measured_value, uncertainty=ref.uncertainty, 
uncertainty_unit=ref.uncertainty_unit, number_of_measurements=ref.number_of_measurements, )) for ref in method.normalizations: DBSession.add( models.Normalization( method=m, reference_sample_name=ref.reference_sample_name, reference_sample_accepted_value=ref. reference_sample_accepted_value, citation=ref.citation, )) parameter = data.add(common.Parameter, 'c', id='category', name='Sample category') for i, opt in enumerate(attr.fields_dict( pypofatu.models.Sample)['sample_category'].validator.options, start=1): data.add(common.DomainElement, opt, parameter=parameter, id=str(i), name=opt) DBSession.flush() assert parameter.pk # Add Samples and UnitParameters and Measurements for analysis in analyses: sample = analysis.sample vsid = '{0}-{1}'.format(sample.location.id, data['Contribution'][sample.source_id].id) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id=valid_id(vsid), language_pk=data['Location'][sample.location.id].pk, parameter_pk=parameter.pk, contribution_pk=data['Contribution'][sample.source_id].pk, ) v = data['Sample'].get(sample.id) if not v: v = data.add( models.Sample, sample.id, id=valid_id(sample.id), name=sample.id, sample_name=sample.sample_name, sample_comment=sample.sample_comment, petrography=sample.petrography, latitude=sample.location.latitude, longitude=sample.location.longitude, elevation=sample.location.elevation, location_comment=sample.location.comment, site_name=sample.site.name, site_code=sample.site.code, site_context=sample.site.context, site_comment=sample.site.comment, site_stratigraphic_position=sample.site.stratigraphic_position, site_stratigraphy_comment=sample.site.stratigraphy_comment, domainelement=data['DomainElement'][sample.sample_category], valueset=vs, artefact_id=sample.artefact.id, artefact_name=sample.artefact.name, artefact_category=sample.artefact.category, artefact_comment=sample.artefact.comment, artefact_attributes=sample.artefact.attributes, artefact_collector=sample.artefact.collector, artefact_collection_type=sample.artefact.collection_type, artefact_collection_location=sample.artefact. 
collection_location, artefact_collection_comment=sample.artefact.collection_comment, artefact_fieldwork_date=sample.artefact.fieldwork_date, ) DBSession.add( models.SampleReference( description='sample', sample=v, source=data['Source'][sample.source_id])) for ref in sample.artefact.source_ids: DBSession.add( models.SampleReference(description='artefact', sample=v, source=data['Source'][ref])) for ref in sample.site.source_ids: DBSession.add( models.SampleReference(description='site', sample=v, source=data['Source'][ref])) a = data.add( models.Analysis, analysis.id, id=better_slug(analysis.id), name=analysis.id, sample=v, ) for i, measurement in enumerate(analysis.measurements): if i == 0: method = measurement.method if method: a.analyzed_material_1 = method.analyzed_material_1 a.analyzed_material_2 = method.analyzed_material_2 a.sample_preparation = method.sample_preparation a.chemical_treatment = method.chemical_treatment a.technique = method.technique a.laboratory = method.laboratory a.analyst = method.analyst pid = slug(measurement.parameter, lowercase=False) p = data['Param'].get(pid) if not p: p = data.add(models.Param, pid, id=pid, name=measurement.parameter) data.add( models.Measurement, None, id='{0}-{1}'.format(a.id, p.id), analysis=a, method=data['Method'].get(measurement.method.id) if measurement.method else None, value=measurement.value, less=measurement.less, precision=measurement.value_sd, sigma=measurement.sd_sigma, unitparameter=p, )
def main(args): # pragma: no cover # # FIXME: more generic: # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld! # - Store datasets in defaultdict(list) keyed with module # datasets = {} for ds in iter_datasets(args.cldf.directory): datasets[ds.module] = ds assert args.glottolog, 'The --glottolog option is required!' data = Data() thedataset = data.add( common.Dataset, hindukush.__name__, id=hindukush.__name__, name='Hindu Kush Areal Typology', domain='hindukush.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }, ) for i, name in enumerate( ['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']): common.Editor(dataset=thedataset, ord=i, contributor=common.Contributor(id=slug( HumanName(name).last), name=name)) for rec in bibtex.Database.from_file(pathlib.Path(__file__).parent / 'HK_website.bib', lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) for module, ds in sorted(datasets.items(), key=lambda i: i[0]): for lang in ds.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'): if lang['id'] not in data['Variety']: data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], subgroup=lang['SubGroup'], location=lang['Location'], elicitation=lang['Elicitation'], jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])), ) contrib = data.add( models.CLDFDataset, module, id=module, name='{} [{}]'.format(ds.properties.get('dc:title'), module), description=ds.properties.get('dc:bibliographicCitation'), module=module, ) if module == 'Wordlist': for param in ds.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'): data.add( models.Param, param['id'], id=param['id'], name='{} [{}]'.format(param['name'], param['id']), sortkey=param['id'] if not param['id'].startswith('Numerals') else 'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])), concepticon_id=param['concepticonReference'], contribution=contrib, category=param['domain'] or 'ASJPlist', ) audio = { r['ID']: r for r in ds.iter_rows('media.csv') if r['mimetype'] == 'audio/mpeg' } for form in ds.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'): vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Param'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) mp3 = next( iter([ audio[aid] for aid in form['Audio_Files'] if aid in audio ]), None) data.add( common.Value, form['id'], id=form['id'], name=form['form'], valueset=vs, jsondata=dict(audio=ds.get_row_url('media.csv', mp3 ) if mp3 else None), ) elif module == 'StructureDataset': for param in ds.iter_rows('ParameterTable', 'id', 'name', 'description'): data.add( models.Param, param['id'], id=param['id'], name=param['name'], 
description=html(param['description']) if param['description'] else None, category=param['Category'], contribution=contrib, ) for code in ds.iter_rows('CodeTable', 'id', 'name', 'description', 'parameterReference'): data.add(common.DomainElement, code['id'], id=code['id'], name=code['name'], description=code['description'], parameter=data['Param'][code['parameterReference']], jsondata={ 'color': { 'absent': 'ff0000', 'present': '0000ff', 'indeterminate': 'cccccc', }.get(code['description']) }) # # FIXME: read CodeTable! # for form in ds.iter_rows('ValueTable', 'id', 'value', 'languageReference', 'parameterReference', 'codeReference', 'source'): vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Param'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) data.add( common.Value, form['id'], id=form['id'], name=form['value'], valueset=vs, domainelement=data['DomainElement'][form['codeReference']]) for (vsid, sid), pages in refs.items(): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages)))) load_families( Data(), [(l.glottocode, l) for l in data['Variety'].values()], glottolog_repos=args.glottolog, isolates_icon='tcccccc', strict=False, )
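Across these loaders the page references collected per (valueset, source) pair are merged into a single description string after dropping empty entries, e.g.:

from clldutils.misc import nfilter

assert '; '.join(nfilter(['12-15', None, '201'])) == '12-15; 201'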