def main(args):  # pragma: no cover
    data = Data()

    print("Setting up dataset…")
    dataset = common.Dataset(
        id=cariban.__name__,
        domain="cariban.clld.org",
        name="Comparative Cariban Database",
        description="Comparative Cariban Database",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_url="https://www.eva.mpg.de",
        publisher_place="Leipzig",
        license="https://creativecommons.org/licenses/by/4.0/",
        contact="*****@*****.**",
        jsondata={'function_paradigms': []},
    )
    fps = []
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            for cons in morph_func["Construction"]:
                fps.append({
                    'Function': function,
                    'Construction': cons,
                    'Morpheme': morph_func['Morpheme']})
    dataset.update_jsondata(function_paradigms=fps)
    DBSession.add(dataset)
    DBSession.flush()

    print("Adding contributors…")
    c = common.Contributor(
        id="fm",
        name="Florian Matter",
        email="*****@*****.**",
        url="https://florianmatter.gitlab.io/")
    dataset.editors.append(common.Editor(contributor=c, ord=1, primary=True))

    print("Adding languages…")
    dialect_mapping = {}
    lang_shorthands = {}
    glottocodes = {}
    lang_ids = {}
    for lang in args.cldf["LanguageTable"]:
        if lang["Sampled"] == "y":
            language = data.add(
                common.Language,
                lang["ID"],
                id=lang["ID"],
                name=lang["Name"],
                latitude=float(lang["Latitude"]) if lang["Latitude"] is not None else None,
                longitude=float(lang["Longitude"]) if lang["Longitude"] is not None else None,
                jsondata={'Shorthand': lang['Shorthand'], 'Glottocode': lang['Glottocode']},
            )
            add_language_codes(data, language, isocode=lang["ISO"], glottocode=lang["Glottocode"])
            if lang["Dialect_Of"] not in [None, "y"]:
                dialect_mapping[lang["ID"]] = lang["Dialect_Of"]
            lang_shorthands[lang["Shorthand"]] = {"ID": lang["ID"], "Name": lang["Name"]}
            glottocodes[lang["Glottocode"]] = {"ID": lang["ID"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}
            lang_ids[lang["ID"]] = {"Glottocode": lang["Glottocode"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}

    def get_lang_id(key):
        if key in lang_shorthands:
            lang_id = lang_shorthands[key]["ID"]
        elif key in glottocodes:
            lang_id = glottocodes[key]["ID"]
        elif key in lang_ids:
            lang_id = key
        else:
            print("Could not identify language %s" % key)
            return None
        if lang_id in dialect_mapping:
            lang_id = dialect_mapping[lang_id]
        return lang_id

    def get_key_and_page(source_string):
        if len(source_string.split("[")) > 1:
            bib_key = source_string.split("[")[0]
            pages = source_string.split("[")[1].split("]")[0]
        else:
            bib_key = source_string
            pages = ""
        return bib_key, pages

    print("Adding sources…")
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    print("Adding language sources…")
    DBSession.flush()
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        if "keywords" in rec:
            for keyword in rec["keywords"].split(","):
                if keyword in lang_shorthands:
                    lang_id = get_lang_id(keyword.strip(" "))
                    if lang_id in data["Language"]:
                        data.add(
                            common.LanguageSource,
                            rec.id + lang_id,
                            language_pk=data["Language"][lang_id].pk,
                            source_pk=data["Source"][rec.id].pk)
    data.add(
        common.Source,
        "pc",
        id="pc",
        name="Personal communication",
        description="Placeholder for data obtained from personal communication.",
        bibtex_type=bibtex.EntryType.misc)

    # print("Adding glossing abbreviations…")
    # length = len(pynterlinear.get_all_abbrevs().keys())
    # for i, (key, name) in enumerate(pynterlinear.get_all_abbrevs().items()):
    #     print("%s/%s" % (i+1, length), end="\r")
    #     DBSession.add(common.GlossAbbreviation(id=key, name=name))
    # print("")
    # print("Adding examples…")
    gloss_replacements = {
        "S_A_": "Sa",
        "S_P_": "Sp"
    }

    def clldify_glosses(gloss_line):
        for orig, new in gloss_replacements.items():
            gloss_line = gloss_line.replace(orig, new)
        gloss_line = re.sub(r"(\d)([A-Z])", r"\1.\2", gloss_line)
        return gloss_line

    for ex in args.cldf["ExampleTable"]:
        lang_id = get_lang_id(ex["Language_ID"])
        new_ex = data.add(
            common.Sentence,
            ex["ID"],
            id=ex["ID"],
            name=ex["Name"],
            description=ex["Translated_Text"],
            analyzed="\t".join(ex["Analyzed_Word"]),
            gloss=clldify_glosses("\t".join(ex["Gloss"])),
            language=data["Language"][lang_id],
            comment=ex["Comment"],
            markup_gloss="\t".join(ex["Morpheme_IDs"]))
        if ex["Source"]:
            bib_key, pages = get_key_and_page(ex["Source"])
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(common.SentenceReference(
                    sentence=new_ex,
                    source=source,
                    key=source.id,
                    description=pages.replace("--", "–")))

    def add_morpheme_reference(morpheme, source_string):
        bib_key, pages = get_key_and_page(source_string)
        if bib_key in data["Source"]:
            source = data["Source"][bib_key]
            DBSession.add(models.MorphemeReference(
                morpheme=morpheme,
                source=source,
                key=source.id,
                description=pages.replace("--", "–")))

    print("Adding morphemes…")
    for morph in args.cldf["FormTable"]:
        lang_id = get_lang_id(morph["Language_ID"])
        form = util.merge_allomorphs("; ".join(morph["Form"])).split("; ")
        new_morph = data.add(
            models.Morpheme,
            morph["ID"],
            morpheme_type="grammatical",
            language=data["Language"][lang_id],
            name="/".join(form),
            id=morph["ID"],
        )
        if morph["Source"]:
            add_morpheme_reference(new_morph, morph["Source"][0])

    print("Adding constructions…")
    data.add(models.DeclarativeType, "imp", id="imp", name="imperative")
    data.add(models.DeclarativeType, "decl", id="decl", name="declarative")
    data.add(models.MainClauseVerb, "y", id="y", name="main clause construction")
    data.add(models.MainClauseVerb, "n", id="n", name="subordinate clause construction")
    for cons in args.cldf["ParameterTable"]:
        lang_id = get_lang_id(cons["Language_ID"])
        new_construction = data.add(
            models.Construction,
            cons["ID"],
            id=cons["ID"],
            language=data["Language"][lang_id],
            name=cons["Description"],
            mainclauseverb=data["MainClauseVerb"][cons["MainClauseVerb"]],
        )
        if cons["DeclarativeType"]:
            new_construction.declarativetype = data["DeclarativeType"][cons["DeclarativeType"]]

    def add_morph_func(morpheme, func_key, construction):
        # note: the data key and unitparameter rely on the enclosing loop's `function` variable
        data.add(
            models.MorphemeFunction,
            "%s:%s" % (morpheme, function),
            id="%s:%s" % (morpheme, func_key),
            name="MorphemeFunction %s:%s" % (morpheme, func_key),
            unit=data["Morpheme"][morpheme],
            unitparameter=data["Meaning"][function],
            construction=construction)

    print("Adding morpheme functions…")
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            func_key = function.replace(".", "_")
            if ">" in function or function == "LK" or bool(re.search(r"\d[SP]$", function) or function == "3"):
                meaning_type = "inflectional"
            else:
                meaning_type = "derivational"
            if function not in data["Meaning"]:
                data.add(
                    models.Meaning,
                    function,
                    id=func_key,
                    name=function,
                    meaning_type=meaning_type)
            # Only some morpheme functions are specified as occurring in specific constructions
            if len(morph_func["Construction"]) == 0:
                for morpheme in morph_func["Morpheme"]:
                    add_morph_func(morpheme, func_key, None)
            else:
                for construction in morph_func["Construction"]:
                    if len(morph_func["Morpheme"]) == 1 and morph_func["Morpheme"][0] != "?":
                        for morpheme in morph_func["Morpheme"]:
                            if data["Morpheme"][morpheme].language != data["Construction"][construction].language:
                                print("Warning: the %s Morpheme %s is stated to occur in the %s construction %s!" % (
                                    data["Morpheme"][morpheme].language,
                                    data["Morpheme"][morpheme],
                                    data["Construction"][construction].language,
                                    data["Construction"][construction]
                                ))
                            cons_func_key = func_key + ":" + construction
                            add_morph_func(morpheme, cons_func_key, data["Construction"][construction])

    print("Checking examples for illustrated morphemes…")
    proto_languages = ["pc"]
    is_illustrated = {}
    for key, row in data["MorphemeFunction"].items():
        if row.unit.language.id in proto_languages:
            continue
        is_illustrated["%s:%s" % (row.unit.id, row.unitparameter.id)] = False
    for row in args.cldf["ExampleTable"]:
        for word in row["Morpheme_IDs"]:
            morph_ids = util.split_word(word)
            for unit_value in morph_ids:
                if unit_value in ["X", "-", "=", "~"]:
                    continue
                unitvaluesentence_key = "{0}-{1}".format(unit_value.replace(".", "-"), row["ID"])
                if unitvaluesentence_key in data["UnitValueSentence"].keys():
                    continue
                is_illustrated[unit_value] = True
                morph_id = unit_value.split(":")[0]
                if morph_id not in data["Morpheme"].keys():
                    print("Warning: Example %s illustrates unknown morpheme %s" % (row["ID"], morph_id))
                elif data["Morpheme"][morph_id].language != data["Sentence"][row["ID"]].language:
                    print("Warning: The %s example %s claims to contain the %s morpheme %s." % (
                        data["Sentence"][row["ID"]].language,
                        row["ID"],
                        data["Morpheme"][morph_id].language,
                        data["Morpheme"][morph_id]
                    ))
                if ":" not in unit_value:
                    print("%s in %s contains no defined function!" % (unit_value, row["ID"]))
                    # without a function part there is nothing to look up below
                    continue
                function = unit_value.split(":")[1]
                morph_function_id = "%s:%s" % (morph_id, function)
                if morph_function_id not in data["MorphemeFunction"].keys():
                    print("Warning: Example %s tries to illustrate inexistent morpheme function %s!" % (
                        row["ID"], unit_value.replace(".", "-")))
                    continue
                data.add(
                    models.UnitValueSentence,
                    unitvaluesentence_key,
                    sentence=data["Sentence"][row["ID"]],
                    unitvalue=data["MorphemeFunction"][morph_function_id],
                )
    # see how many morpheme functions are illustrated with example sentences
    good_ill = [key for key, value in is_illustrated.items() if value]
    not_ill = [key for key, value in is_illustrated.items() if not value]
    not_ill.sort()
    cov = len(good_ill) / len(is_illustrated) * 100
    print("Morpheme exemplification coverage is at %s%%. List of unillustrated morphemes saved to unillustrated_morphemes.txt" % str(round(cov, 2)))
    with open("../unillustrated_morphemes.txt", "w") as f:
        for morph in not_ill:
            f.write(morph + "\n")

    print("Adding cognate sets…")
    for cogset in args.cldf["CognatesetTable"]:
        new_cset = data.add(
            models.Cognateset,
            cogset["ID"],
            id=cogset["ID"],
            name=cogset["Name"],
            description=cogset["Function"],
            cogset_type="grammatical")
        if cogset["Source"]:
            for source in cogset["Source"]:
                bib_key, pages = get_key_and_page(source)
                if bib_key in data["Source"]:
                    source = data["Source"][bib_key]
                    DBSession.add(models.CognatesetReference(
                        cognateset=new_cset,
                        source=source,
                        key=source.id,
                        description=pages))

    print("Adding cognates…")
    for morph in args.cldf["FormTable"]:
        for cognate_ID in morph["Cognateset_ID"]:
            DBSession.add(models.Cognate(
                cognateset=data["Cognateset"][cognate_ID],
                counterpart=data["Morpheme"][morph["ID"]]))

    print("Adding morpheme comments…")
    for row in args.cldf["FormTable"]:
        data["Morpheme"][row["ID"]].markup_description = util.generate_markup(row["Comment"])

    print("Adding construction descriptions…")
    for cons in args.cldf["ParameterTable"]:
        if cons["Comment"] is None:
            description = ""
        else:
            description = util.generate_markup(cons["Comment"])
        description += "\n" + util.generate_markup(util.transitive_construction_paradigm(cons["ID"]))
        description += util.generate_markup(util.intransitive_construction_paradigm(cons["ID"]))
        data["Construction"][cons["ID"]].markup_description = description

    print("Adding cognate set descriptions…")
    for cogset in args.cldf["CognatesetTable"]:
        data["Cognateset"][cogset["ID"]].markup_description = util.generate_markup(cogset["Description"])
        # if cogset["ID"] == "13pro":
        #     data["Cognateset"][cogset["ID"]].markup_description += util.generate_markup(
        #         util.comparative_function_paradigm(
        #             ["apa_main", "tri_main", "way_main", "mak_main", "kar_main", "hix_main",
        #              "wai_main", "ara_main", "ikp_main", "wmr_main", "pan_old", "kax_main"],
        #             "1+3 scenarios",
        #             ["1+3S", "1+3>3", "3>1+3", "2>1+3", "1+3>2"]))

    def add_tree_labels(phylo):
        uncertain_nodes = []
        for node in phylo.find_clades():
            if node.name is None or not node.is_terminal():
                continue
            plain_name = node.name.replace("?", "")
            if "?" in node.name:
                uncertain_nodes.append(plain_name)
            if plain_name in lang_ids:
                node.name = lang_ids[plain_name]["Name"].replace("'", "’")
            if plain_name in uncertain_nodes:
                node.name += "?"
        return phylo, uncertain_nodes

    print("Adding trees…")
    own_trees = ["matter"]
    tree_path = str(args.cldf.tablegroup._fname.parent / '..' / 'raw')
    newick_files = {}
    for tree in args.cldf["cariban_trees.csv"]:
        if tree["ID"] in own_trees:
            continue
        newick_files[tree["ID"]] = {
            "orig": tree["ID"] + "_orig.newick",
            "norm": tree["ID"] + "_norm.newick",
            "source": tree["Source"],
            "comment": tree["Comment"],
            "o_comment": tree["Orig_Comment"]
        }
    # adding my own trees separately.
    for my_tree_count, tree_id in enumerate(own_trees):
        my_tree = Phylo.read(tree_path + "/" + "%s.newick" % tree_id, "newick")
        my_tree, uncertain_nodes = add_tree_labels(my_tree)
        edited_tree = io.StringIO()
        Phylo.write(my_tree, edited_tree, "newick")
        tree = edited_tree.getvalue().replace(":0.00000", "")
        my_phylo = Phylogeny(
            tree_id,
            id=tree_id,
            name="Matter (2020)",  # % str(my_tree_count+1),
            newick=tree,
            markup_description="My own, conservative, classification.")
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes:
                lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=my_phylo
                )
            )
        DBSession.add(my_phylo)

    # adding the other trees
    for tree_id, values in newick_files.items():
        norm_biotree = Phylo.read(tree_path + "/" + values["norm"], "newick")
        orig_biotree = Phylo.read(tree_path + "/" + values["orig"], "newick")
        norm_biotree, uncertain_nodes = add_tree_labels(norm_biotree)
        edited_tree = io.StringIO()
        Phylo.write(norm_biotree, edited_tree, "newick")
        norm_tree = edited_tree.getvalue().replace(":0.00000", "")
        edited_tree = io.StringIO()
        Phylo.write(orig_biotree, edited_tree, "newick")
        orig_tree = edited_tree.getvalue().replace(":0.00000", "")
        norm_phylo = Phylogeny(
            id=tree_id + "_norm",
            name=str(data["Source"][values["source"]]) + " (Normalized)",
            markup_description=util.generate_markup("Source: src:" + values["source"])
            + "<br>This is a normalized version of <a href='/phylogeny/%s_orig'>this original tree</a>." % tree_id
            + util.generate_markup("<br>Comments: %s" % values["comment"]),
            newick=norm_tree)
        if values["o_comment"] is None:
            o_comment = ""
        else:
            o_comment = values["o_comment"]
        orig_phylo = Phylogeny(
            id=tree_id + "_orig",
            name=str(data["Source"][values["source"]]) + " (Original)",
            markup_description=util.generate_markup("Source: src:" + values["source"])
            + "<br>This is a representation of the original classification. A normalized version can be found <a href='/phylogeny/%s_norm'>here</a>." % tree_id
            + util.generate_markup("<br>Comments: %s" % values["comment"] + " " + o_comment),
            newick=orig_tree)
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes:
                lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=norm_phylo
                )
            )
        DBSession.add(norm_phylo)
        DBSession.add(orig_phylo)

    print("Adding t-adding verb cognate sets…")
    for t_verb_set in args.cldf["cariban_t_cognates.csv"]:
        cognate_ID = "t" + t_verb_set["ID"]
        rec_t_form = "*[%s]%s" % (t_prefix_form(t_verb_set["Form"]), t_verb_set["Form"])
        t_cogset = data.add(
            models.Cognateset,
            cognate_ID,
            id=cognate_ID,
            name=rec_t_form,
            description="‘%s’ (*t-adding verb)" % t_verb_set["Parameter_ID"],
            cogset_type="t_adding")
        if t_verb_set["Source"]:
            bib_key = t_verb_set["Source"].split("[")[0]
            if len(t_verb_set["Source"].split("[")) > 1:
                pages = t_verb_set["Source"].split("[")[1].split("]")[0]
            else:
                pages = " "
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(models.CognatesetReference(
                    cognateset=t_cogset,
                    source=source,
                    key=source.id,
                    description=pages))

    print("Adding t-adding verbs…")
    t_langs = {}
    t_verbs = {}
    non_t_adding_lgs = ["ing", "mac", "kar", "wmr", "pan"]
    data.add(
        models.Meaning,
        "t_verb",
        id="t-verb",
        name="t-adding verb",
    )
    for t_verb_entry in args.cldf["cariban_t_verbs.csv"]:
        if t_verb_entry["Language_ID"] == "cari1283":
            continue
        cognate_ID = "t" + t_verb_entry["Cognateset_ID"]
        lang_id = get_lang_id(t_verb_entry["Language_ID"])
        morph_id = lang_id + "_" + cognate_ID
        if morph_id in data["Morpheme"].keys():
            if morph_id + "_2" in data["Morpheme"].keys():
                morph_id += "_3"
            else:
                morph_id += "_2"
        t_verb = data.add(
            models.Morpheme,
            morph_id,
            id=morph_id,
            morpheme_type="t_adding",
            name=t_verb_entry["Form"],
            language=data["Language"][lang_id],
        )
        DBSession.add(models.Cognate(
            cognateset=data["Cognateset"][cognate_ID],
            counterpart=t_verb))
        if t_verb_entry["t"] == "y":
            t_verb.name = "[%s]%s" % (t_prefix_form(t_verb.name), t_verb.name)
            t_verb.markup_description = util.generate_markup("Shows cogset:t")
        if t_verb_entry["t"] == "?" and lang_id not in non_t_adding_lgs:
            t_verb.name = "[t-?]" + t_verb.name
            t_verb.markup_description = util.generate_markup("It is not known if this verb shows cogset:t")
        if t_verb_entry["t"] == "n":
            t_verb.markup_description = util.generate_markup("Does not show cogset:t")
        if lang_id not in t_langs.keys():
            t_langs[lang_id] = {"y": 0, "n": 0, "?": 0}
        if cognate_ID not in t_verbs.keys():
            t_verbs[cognate_ID] = {"y": 0, "n": 0, "?": 0}
        t_langs[lang_id][t_verb_entry["t"]] += 1
        if lang_id not in non_t_adding_lgs:
            t_verbs[cognate_ID][t_verb_entry["t"]] += 1
        if t_verb_entry["Source"]:
            add_morpheme_reference(t_verb, t_verb_entry["Source"])
        data.add(
            models.MorphemeFunction,
            "t_" + t_verb_entry["ID"],
            id="t_" + t_verb_entry["ID"],
            name="t-Verb %s" % t_verb_entry["Parameter_ID"],
            unit=t_verb,
            unitparameter=data["Meaning"]["t_verb"],
            construction=None)
    for lang, values in t_langs.items():
        data["Language"][lang].update_jsondata(t_values=values)
    for verb, values in t_verbs.items():
        # data["Cognateset"][verb].description += " (%s/%s)" % (str(values["y"]), str(values["n"]+values["y"]+values["?"]))
        data["Cognateset"][verb].markup_description = util.generate_markup(
            "This verb occurs with obj:t- in %s of %s languages which show reflexes of cogset:t." % (
                str(values["y"]), str(values["n"] + values["y"] + values["?"])))

    print("Adding reconstructed lexemes…")
    proto_forms = {}
    for cogset in args.cldf["cariban_lexical_reconstructions.csv"]:
        proto_forms[cogset["ID"]] = cogset["Form"]
    first_found = []
    for entry in args.cldf["cariban_swadesh_list.csv"]:
        cognateset_ID = entry["Parameter_ID"].replace("/", "_") + "-" + entry["Cognateset_ID"]
        if cognateset_ID not in data["Cognateset"]:
            if cognateset_ID in proto_forms:
                form = "*" + proto_forms[cognateset_ID].replace("; ", " / ")
            # else:
            #     form = ""
            data.add(
                models.Cognateset,
                cognateset_ID,
                id=cognateset_ID,
                name=form,
                description=cognateset_ID,
                cogset_type="lexical")
        lang_id = get_lang_id(entry["Language_ID"])
        if lang_id not in data["Language"]:
            continue
        function = entry["Parameter_ID"].replace(".", "_")
        morph_id = entry["Language_ID"] + "_" + function
        if morph_id in first_found:
            continue
        first_found.append(morph_id)
        if function not in data["Meaning"].keys():
            data.add(
                models.Meaning,
                function,
                id=function,
                name=function,
                meaning_type="lexical")
        morpheme = data.add(
            models.Morpheme,
            morph_id,
            id=morph_id,
            morpheme_type="lexical",
            name=entry["Value"][0],
            language=data["Language"][lang_id],
        )
        data.add(
            models.MorphemeFunction,
            "%s:%s" % (morph_id, function),
            id="%s:%s" % (morph_id, function),
            name="MorphemeFunction %s:%s" % (morph_id, function),
            unit=data["Morpheme"][morph_id],
            unitparameter=data["Meaning"][function],
            construction=None)
        if entry["Source"]:
            add_morpheme_reference(morpheme, entry["Source"])
        if cognateset_ID in proto_forms:
            DBSession.add(models.Cognate(
                cognateset=data["Cognateset"][cognateset_ID],
                counterpart=morpheme))
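

# A minimal, standalone sketch (not part of the import script above) of the
# "bibkey[pages]" source-reference convention that get_key_and_page() parses;
# the helper name and the sample keys below are invented for illustration.
def split_source_string(source_string):
    """Split e.g. 'smith2005[12-15]' into ('smith2005', '12-15')."""
    if "[" in source_string:
        bib_key, _, rest = source_string.partition("[")
        return bib_key, rest.split("]")[0]
    return source_string, ""

assert split_source_string("smith2005[12-15]") == ("smith2005", "12-15")
assert split_source_string("smith2005") == ("smith2005", "")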
def setUp(self):
    TestWithDb.setUp(self)

    DBSession.add(common.Dataset(
        id='dataset', name='dataset', description='desc', domain='clld'))

    source = common.Source(id='source')
    contributors = {
        'contributor': 'A Name',
        'b': 'b Name',
        'c': 'c Name',
        'd': 'd Name',
    }
    for id_, name in contributors.items():
        contributors[id_] = common.Contributor(id=id_, name=name)

    contribution = common.Contribution(id='contribution', name='Contribution')
    cr = common.ContributionReference(contribution=contribution, source=source)

    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['contributor'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['b'])
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['c'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['d'])

    DBSession.add(contribution)

    language = common.Language(id='language', name='Language 1', latitude=10.5, longitude=0.3)
    language.sources.append(source)
    identifier = common.Identifier(type='iso639-3', id='iso')
    li = common.LanguageIdentifier(language=language, identifier=identifier)

    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        _li = common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = common.ValueSet(
        id='valueset', language=language, parameter=param, contribution=contribution)
    value = common.Value(
        id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    paramnd = common.Parameter(id='no-domain', name='Parameter without domain')
    valueset = common.ValueSet(
        id='vs2', language=language, parameter=paramnd, contribution=contribution)
    vr = common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up))

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(common.UnitValue(
        id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = common.Sentence(
        id='sentence', name='sentence name', description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo', gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own', comment='comment', original_script='a morpheme', language=language)
    sr = common.SentenceReference(sentence=sentence, source=source)
    DBSession.add(common.Config(key='key', value='value'))

    DBSession.flush()
def main(args): data = Data() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read(args, 'People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 11, 4), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License' }) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict( (row['ID'], row['RGB_code']) for row in read(args, 'Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True): for match in GLOSS_ABBR_PATTERN.finditer(row.standard): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning)) non_bibs = {} for row in read(args, 'References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add(common.Source, row['Reference_ID'], id=str(row['Reference_ID']), name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() DBSession.flush() infobox = 
jsonload(args.data_file('infobox.json')) glottocodes = jsonload(args.data_file('glottocodes.json')) for row in read(args, 'Languages', 'Order_number'): lon, lat = [ float(c.strip()) for c in row['map_coordinates'].split(',') ] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in enumerate(infobox[lect.id]): DBSession.add( common.Language_data(object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] \ != "Checked": print 'unchecked! ---', row['Language_name'] desc = row.get( 'Languages_contribution_documentation::Lect description', '') markup_desc = normalize_markup(row[ 'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'] ) c = data.add( models.ApicsContribution, row['Language_ID'], id=str(row['Order_number']), name=row['Language_name'], description=desc, markup_description=markup_desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) for ext, label, mtype in [ ('pdf', 'Glossed text', 'application/pdf'), ('mp3', 'Glossed text audio', 'audio/mpeg'), ]: fid = '%s-gt.%s' % (c.id, ext) if args.data_file('files', 'contribution', c.id, fid).exists(): common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype) else: print label, 'missing for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add(common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type=common.IdentifierType.iso.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if lect.id in glottocodes: identifier = data.add(common.Identifier, 'gc:%s' % glottocodes[lect.id], id=glottocodes[lect.id], name=glottocodes[lect.id], type=common.IdentifierType.glottolog.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=identifier)) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add(common.Identifier, row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][ row['Language_name_ethnologue']])) example_count = {} for row in read(args, 'Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max( [example_count.get(row['Language_ID'], 1), row['Example_number']]) p = add_sentence( args, data, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), 
markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']), original_script=row['Original_script'], jsondata={ 'sort': row['Order_number'], 'alt_translation': (row['Translation_other'] or '').strip() or None }, language=lang) if row['Reference_ID']: if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.SentenceReference( sentence=p, source=source, key=source.id, description=row['Reference_pages'])) else: p.source = non_bibs[row['Reference_ID']] DBSession.flush() for row in read(args, 'Language_references'): if row['Reference_ID'] not in data['Source']: assert row['Reference_ID'] in non_bibs continue assert row['Language_ID'] in data['ApicsContribution'] source = data['Source'][row['Reference_ID']] DBSession.add( common.ContributionReference( contribution=data['ApicsContribution'][row['Language_ID']], source=source, description=row['Pages'], key=source.id)) # # global counter for features - across feature types # feature_count = 0 for row in read(args, 'Features', 'Feature_number'): id_ = str(row['Feature_number']) if int(id_) > feature_count: feature_count = int(id_) wals_id = None desc = row['Feature_annotation_publication'] if row['WALS_match'] == 'Total': if isinstance(row['WALS_No.'], int): wals_id = row['WALS_No.'] else: wals_id = int(row['WALS_No.'].split('.')[0].strip()) p = data.add(models.Feature, row['Feature_code'], name=row['Feature_name'], id=id_, description=desc, markup_description=normalize_markup( row['z_calc_Feature_annotation_publication_CSS']), feature_type='primary', multivalued=row['Value_relation_type'] != 'Single', area=row['Feature_area'], wals_id=wals_id) names = {} for i in range(1, 10): if not row['Value%s_publication' % i] \ or not row['Value%s_publication' % i].strip(): continue name = row['Value%s_publication' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 de = data.add( common.DomainElement, '%s-%s' % (row['Feature_code'], i), id='%s-%s' % (id_, i), name=name, parameter=p, abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name, number=int(row['Value%s_value_number_for_publication' % i]), jsondata={'color': colors[row['Value_%s_colour_ID' % i]]}, ) assert de if row['Authors_FeatureArticles']: authors, _ = row['Authors_FeatureArticles'].split('and the APiCS') authors = authors.strip() if authors.endswith(','): authors = authors[:-1].strip() for i, name in enumerate(authors.split(',')): assert name.strip() in editors p._authors.append( models.FeatureAuthor(ord=i + 1, contributor=editors[name.strip()])) DBSession.flush() primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41} segment_to_primary = dict( zip(primary_to_segment.values(), primary_to_segment.keys())) number_map = {} names = {} for row in read(args, 'Segment_features', 'Order_number'): symbol = row['Segment_symbol'] if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate': symbol = 't\u0361s' truth = lambda s: s and s.strip().lower() == 'yes' name = '%s - %s' % (symbol, row['Segment_name']) if name in names: number_map[row['Segment_feature_number']] = names[name] continue number_map[ row['Segment_feature_number']] = row['Segment_feature_number'] names[name] = row['Segment_feature_number'] feature_count += 1 if row['Segment_feature_number'] in segment_to_primary: primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\ = str(feature_count) p = data.add(models.Feature, row['Segment_feature_number'], name=name, id=str(feature_count), feature_type='segment', area='Vowels' if 
truth(row['Vowel']) else ('Obstruent consonants' if truth(row['Obstruent']) else 'Sonorant consonants'), jsondata=dict( number=int(row['Segment_feature_number']), vowel=truth(row['Vowel']), consonant=truth(row['Consonant']), obstruent=truth(row['Obstruent']), core_list=truth(row['Core_list_segment']), symbol=symbol, )) for i, spec in SEGMENT_VALUES.items(): data.add(common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]), id='%s-%s' % (p.id, i), name=spec[0], parameter=p, jsondata={'color': spec[1]}, number=i) print '--> remapped:', primary_to_segment DBSession.flush() for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'): feature_count += 1 p = data.add(models.Feature, row['Sociolinguistic_feature_code'], name=row['Sociolinguistic_feature_name'], id='%s' % feature_count, description=row['Sociolinguistic_feature_annotation'], area='Sociolinguistic', feature_type='sociolinguistic') names = {} for i in range(1, 10): id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i) if row.get('Value%s' % i) and row['Value%s' % i].strip(): name = row['Value%s' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 else: continue kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i) data.add(common.DomainElement, id_, id='%s-%s' % (p.id, i), name=name, parameter=p, number=i, jsondata={ 'color': colors.get(row['Value%s_colour_ID' % i], colors.values()[i]) }) sd = {} for row in read(args, 'Segment_data'): if row['Segment_feature_number'] not in number_map: continue number = number_map[row['Segment_feature_number']] if not row['Presence_in_the_language']: continue lang = data['Lect'][row['Language_ID']] param = data['Feature'][number] id_ = '%s-%s' % (lang.id, param.id) if id_ in sd: assert row['c_Record_is_a_duplicate'] == 'Yes' continue sd[id_] = 1 valueset = data.add( common.ValueSet, id_, id=id_, parameter=param, language=lang, contribution=data['ApicsContribution'][row['Language_ID']], description=row['Comments'], markup_description=normalize_markup(row['z_calc_Comments_CSS']), ) v = data.add( common.Value, id_, id=id_, frequency=float(100), valueset=valueset, domainelement=data['DomainElement'][ '%s-%s' % (number, row['Presence_in_the_language'])], ) if row['Example_word'] and row['Example_word_gloss']: example_count[row['Language_ID']] += 1 p = add_sentence(args, data, '%s-p%s' % (lang.id, data['Feature'][number].id), id='%s-%s' % (lang.id, example_count[row['Language_ID']]), name=row['Example_word'], description=row['Example_word_gloss'], language=lang) DBSession.add(common.ValueSentence(value=v, sentence=p)) source = data['Source'].get(row['Refers_to_references_Reference_ID']) if source: DBSession.add( common.ValueSetReference(valueset=valueset, source=source, key=source.id)) elif row['Refers_to_references_Reference_ID'] in non_bibs: valueset.source = non_bibs[ row['Refers_to_references_Reference_ID']] lects = defaultdict(lambda: 1) lect_map = {} records = {} false_values = {} no_values = {} wals_value_number = {} for row in read(args, 'wals'): if row['z_calc_WALS_value_number']: wals_value_number[ row['Data_record_id']] = row['z_calc_WALS_value_number'] def prefix(attr, _prefix): if _prefix: return '%s_%s' % (_prefix, attr) return attr.capitalize() for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]: num_values = 10 for row in read(args, prefix('data', _prefix)): if not row[prefix('feature_code', _prefix)]: print('no associated feature for', prefix('data', _prefix), row[prefix('data_record_id', _prefix)]) continue lid 
= row['Language_ID'] lect_attr = row.get('Lect_attribute', 'my default lect').lower() if lect_attr != 'my default lect': if (row['Language_ID'], row['Lect_attribute']) in lect_map: lid = lect_map[(row['Language_ID'], row['Lect_attribute'])] else: lang = data['Lect'][row['Language_ID']] c = lects[row['Language_ID']] lid = '%s-%s' % (row['Language_ID'], c) kw = dict( name='%s (%s)' % (lang.name, row['Lect_attribute']), id='%s' % (1000 + 10 * int(lang.id) + c), latitude=lang.latitude, longitude=lang.longitude, description=row['Lect_attribute'], language=lang, ) data.add(models.Lect, lid, **kw) lects[row['Language_ID']] += 1 lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid id_ = abbr + str(row[prefix('data_record_id', _prefix)]) assert id_ not in records records[id_] = 1 assert row[prefix('feature_code', _prefix)] in data['Feature'] language = data['Lect'][lid] parameter = data['Feature'][row[prefix('feature_code', _prefix)]] valueset = common.ValueSet( id='%s-%s' % (language.id, parameter.id), description=row['Comments_on_value_assignment'], markup_description=normalize_markup( row.get('z_calc_Comments_on_value_assignment_CSS')), ) values_found = {} for i in range(1, num_values): if not row['Value%s_true_false' % i]: continue if row['Value%s_true_false' % i].strip().lower() != 'true': assert row['Value%s_true_false' % i].strip().lower() == 'false' false_values[row[prefix('data_record_id', _prefix)]] = 1 continue iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i) if iid not in data['DomainElement']: print(iid, row[prefix('data_record_id', _prefix)], '--> no domainelement!') continue values_found['%s-%s' % (id_, i)] = dict( id='%s-%s' % (valueset.id, i), domainelement=data['DomainElement']['%s-%s' % (row[prefix( 'feature_code', _prefix)], i)], confidence=row['Value%s_confidence' % i], frequency=float(row['c_V%s_frequency_normalised' % i]) if _prefix == '' else 100) if values_found: if row[prefix('data_record_id', _prefix)] in wals_value_number: valueset.jsondata = { 'wals_value_number': wals_value_number.pop(row[prefix( 'data_record_id', _prefix)]) } valueset.parameter = parameter valueset.language = language valueset.contribution = data['ApicsContribution'][ row['Language_ID']] valueset = data.add(common.ValueSet, id_, _obj=valueset) for i, item in enumerate(values_found.items()): if i > 0 and not parameter.multivalued: print 'multiple values for single-valued parameter: %s' % id_ break id_, kw = item kw['valueset'] = valueset value = data.add(common.Value, id_, **kw) # # store references to additional data for segments which should be reused # for corresponding primary features! 
# if int(parameter.id) in primary_to_segment: assert len(values_found) == 1 seg_id = '%s-%s' % (language.id, primary_to_segment[int( parameter.id)]) seg_valueset = data['ValueSet'][seg_id] seg_value = data['Value'][seg_id] if not valueset.description and seg_valueset.description: valueset.description = seg_valueset.description for s in seg_value.sentence_assocs: DBSession.add( common.ValueSentence(value=value, sentence=s.sentence)) for r in seg_valueset.references: DBSession.add( common.ValueSetReference(valueset=valueset, source=r.source, key=r.key)) if not valueset.source and seg_valueset.source: valueset.source = seg_valueset.source DBSession.flush() else: no_values[id_] = 1 DBSession.flush() for prefix, abbr, num_values in [ ('D', '', 10), ('Sociolinguistic_d', 'sl', 7), ]: for row in read(args, prefix + 'ata_references'): assert row['Reference_ID'] in data['Source'] \ or row['Reference_ID'] in non_bibs try: vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])] if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.ValueSetReference( valueset=vs, source=source, key=source.id, description=row['Pages'], )) else: if vs.source: vs.source += '; ' + non_bibs[row['Reference_ID']] else: vs.source = non_bibs[row['Reference_ID']] except KeyError: continue DBSession.flush() missing = 0 for row in read(args, 'Value_examples'): try: DBSession.add( common.ValueSentence( value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row], sentence=data['Sentence'][ '%(Language_ID)s-%(Example_number)s' % row], description=row['Notes'], )) except KeyError: missing += 1 print('%s Value_examples are missing data' % missing) print('%s data sets with false values' % len(false_values)) print('%s data sets without values' % len(no_values)) for k, v in wals_value_number.items(): print 'unclaimed wals value number:', k, v for i, row in enumerate(read(args, 'Contributors')): kw = dict(contribution=data['ApicsContribution'][row['Language ID']], contributor=data['Contributor'][row['Author ID']]) if row['Order_of_appearance']: kw['ord'] = int(float(row['Order_of_appearance'])) data.add(common.ContributionContributor, i, **kw) DBSession.flush()
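

# A standalone sketch of the prefix() helper defined in the APiCS import above:
# it derives the column names used for the default vs. the "Sociolinguistic"
# data tables. The sample calls below are for illustration only.
def prefix(attr, _prefix):
    if _prefix:
        return '%s_%s' % (_prefix, attr)
    return attr.capitalize()

assert prefix('data', '') == 'Data'
assert prefix('feature_code', '') == 'Feature_code'
assert prefix('data', 'Sociolinguistic') == 'Sociolinguistic_data'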
def load():
    wals = create_engine('postgresql://robert@/wals3')

    contributor = common.Contributor(id='gastvolker', name='Volker Gast')
    contribution = common.Contribution(
        id='tdir', name='Typological Database of Intensifiers and Reflexives')
    cc = common.ContributionContributor(
        contribution=contribution, contributor=contributor)
    DBSession.add(cc)

    for row in read('glosses'):
        DBSession.add(common.GlossAbbreviation(id=row['gloss'], name=row['explanation']))

    params = {}
    for id_, name in PARAMS.items():
        params[id_] = common.Parameter(id='tdir-' + id_, name=name)
        DBSession.add(params[id_])
    #
    # TODO: domain for sortal restrictions!
    #

    values = {}
    languages = {}
    for row in read('languages'):
        if row['adn'] and '<br>' in row['adn']:
            row['adn'], other = row['adn'].split('<br>', 1)
            if not row['otherint']:
                row['otherint'] = ''
            row['otherint'] = '\n'.join(
                filter(None, row['otherint'].split('<br>') + other.split('<br>')))
        row['sil'] = row['sil'].lower()
        row['sil'] = {
            'arm': 'hye',
            'vmn': 'mig',
            'gli': 'gle',
            'grk': 'ell',
            'hbr': 'heb',
            'ltn': 'lat',
            'chn': 'cmn',
            'ota': 'ote',
            'pnj': 'pan',
            'pba': 'rap',
            'esg': 'kal',
            'vla': 'zea',
            'lat': 'lav',
        }.get(row['sil'], row['sil'])
        l = common.Language(id=row['sil'].lower(), name=row['language'])
        languages[row['language']] = l

        res = wals.execute(
            "select l.latitude, l.longitude from language as l, languageidentifier as li, "
            "identifier as i where l.pk = li.language_pk and li.identifier_pk = i.pk "
            "and i.id = '%s' and i.type = 'iso639-3';" % row['sil']).fetchone()
        if not res:
            res = wals.execute(
                "select latitude, longitude from language where name = '%s';" % row['language']).fetchone()
        if res:
            l.latitude, l.longitude = res
        else:
            print(row['language'], row['sil'])
            # (u'Classical Nahuatl', u'nci') ???
            # (u'Ancient Greek', u'gko')

        for pid in params.keys():
            value = row[pid]
            if value:
                value = common.Value(
                    id='tdir-%s-%s' % (pid, l.id),
                    name=unicode(bs(value)),
                    contribution=contribution,
                    parameter=params[pid],
                    language=l)
                values['%s-%s' % (pid, row['language'])] = value
                DBSession.add(value)

    def normalize_ref(ref):
        ref = re.sub(r'\s+', ' ', ref).strip()
        return unicode(bs(ref)).replace('<i>', '"').replace('</i>', '"')

    """
    Ogawa, A. (1998)
    Wali, K. et al. (2000)
    Lyutikova. -> Lyutikova,
    se-Bertit -> se-Berit

    missing refs:
    Sengupta, G. (2000). Lexical anaphors and pronouns in Bangla. In Lust et al. (eds.),
    <i>Lexical Anaphors and Pronouns in Selected South Asian Languages</i>. Berlin: Mouton de Gruyter.
    Davison, A. Mistry (2000). Lexical anaphors and pronouns in Hindi/Urdu. In Lust et al. (eds.),
    <i>Lexical Anaphors and Pronouns in Selected South Asian Languages</i>. Berlin: Mouton de Gruyter.
    """
    refs = {}
    for row in read('references'):
        name = re.sub(r'\s+', ' ', row['entry'].split(').')[0].strip()) + ')'
        src = common.Source(
            id=row['ref'].strip(), name=name, description=normalize_ref(row['entry']))
        refs[name] = src
        DBSession.add(src)

    for row in read('examples'):
        if row['language'] not in languages:
            print('example for unknown language "%s"' % row['language'])
            continue
        s = common.Sentence(
            id=row['Nr'].strip(),
            name=fix_example(row['original'], repl=' '),
            language=languages[row['language']],
            analyzed=fix_example(row['original']),
            gloss=fix_example(row['gloss']),
            description=row['translation'],
            source=row['source'],
            comment=row['comments'])
        has_refs = False
        for ref in refs:
            if ref in row['source']:
                if normalize_ref(row['source']) != refs[ref].description:
                    print('-->')
                    print(row['source'])
                has_refs = True
                common.SentenceReference(sentence=s, source=refs[ref])
        if not has_refs:
            print('+++++')
            print(row['source'])
        pid = EXAMPLE_MAP[row['pov']]
        if pid:  # associate with value!
            o = common.ValueSentence(value=values['%s-%s' % (pid, row['language'])], sentence=s)
        DBSession.add(s)
def populate_test_db(engine): set_alembic_version(engine, '58559d4eea0d') data = TestData() data.add_default(common.Dataset, domain='clld', jsondata={ 'license_icon': 'cc-by', 'license_url': 'http://example.org' }) data.add_default(common.Contributor, name='A Name', email='*****@*****.**') for id_, name in { 'b': 'b Name', 'c': 'c Name', 'd': 'd Name', }.items(): data.add(common.Contributor, id_, id=id_, name=name, url='http://example.org') DBSession.add( common.Editor(dataset=data[common.Dataset], contributor=data[common.Contributor])) data.add_default(common.Source) data.add(common.Source, 'replaced', id='replaced', active=False, jsondata={'__replacement_id__': 'source'}) data.add_default(common.Contribution) common.ContributionReference(contribution=data[common.Contribution], source=data[common.Source]) for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'), (False, 'd')]: common.ContributionContributor(contribution=data[common.Contribution], primary=primary, contributor=data['Contributor'][c]) data.add_default(common.Language, latitude=10.5, longitude=0.3) data[common.Language].sources.append(data[common.Source]) for i, type_ in enumerate(common.IdentifierType): common.LanguageIdentifier( language=data[common.Language], identifier=common.Identifier( type=type_.value, id=type_.value + str(i), name='abc' if type_.name == 'iso' else 'glot1234')) common.LanguageIdentifier(language=data[common.Language], identifier=common.Identifier(type='name', id='name', name='a')) for i in range(2, 102): _l = common.Language(id='l%s' % i, name='Language %s' % i) _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='abc') common.LanguageIdentifier(language=_l, identifier=_i) DBSession.add(_l) param = data.add_default(common.Parameter) de = common.DomainElement(id='de', name='DomainElement', parameter=param) de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param) valueset = data.add_default(common.ValueSet, language=data[common.Language], parameter=param, contribution=data[common.Contribution]) common.ValueSetReference(valueset=valueset, source=data[common.Source], description='10-20') data.add_default(common.Value, domainelement=de, valueset=valueset, frequency=50, confidence='high') data.add(common.Value, 'value2', id='value2', domainelement=de2, valueset=valueset, frequency=50, confidence='high') paramnd = data.add(common.Parameter, 'no-domain', id='no-domain', name='Parameter without domain') valueset = common.ValueSet(id='vs2', language=data[common.Language], parameter=paramnd, contribution=data[common.Contribution]) common.ValueSetReference(valueset=valueset, source=data[common.Source], description='10-20') common.Value(id='v2', valueset=valueset, frequency=50, confidence='high') unit = data.add_default(common.Unit, language=data[common.Language]) up = data.add_default(common.UnitParameter) common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up) up2 = common.UnitParameter(id='up2', name='UnitParameter with domain') de = common.UnitDomainElement(id='de', name='de', parameter=up2) DBSession.add( common.UnitValue(id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de)) DBSession.add(common.Source(id='s')) sentence = data.add_default(common.Sentence, description='sentence description', analyzed='a\tmorpheme\tdoes\tdo', gloss='a\tmorpheme\t1SG\tdo.SG2', source='own', comment='comment', original_script='a morpheme', language=data[common.Language], jsondata={'alt_translation': 'Spanish: ...'}) 
common.SentenceReference(sentence=sentence, source=data[common.Source]) DBSession.add(common.Config(key='key', value='value')) common.Config.add_replacement('replaced', 'language', model=common.Language) common.Config.add_replacement('gone', None, model=common.Language) DBSession.flush()
def setUp(self): TestWithDb.setUp(self) DBSession.add( common.Dataset(id='dataset', name='dataset', description='desc', domain='clld', jsondata={'license_icon': 'cc-by'})) DBSession.add( common.Source(id='replaced', active=False, jsondata={'__replacement_id__': 'source'})) source = common.Source(id='source') contributors = { 'contributor': 'A Name', 'b': 'b Name', 'c': 'c Name', 'd': 'd Name' } for id_, name in contributors.items(): contributors[id_] = common.Contributor(id=id_, name=name, url='http://example.org') contribution = common.Contribution(id='contribution', name='Contribution') common.ContributionReference(contribution=contribution, source=source) assert common.ContributionContributor( contribution=contribution, primary=True, contributor=contributors['contributor']) assert common.ContributionContributor(contribution=contribution, primary=False, contributor=contributors['b']) assert common.ContributionContributor(contribution=contribution, primary=True, contributor=contributors['c']) assert common.ContributionContributor(contribution=contribution, primary=False, contributor=contributors['d']) DBSession.add(contribution) language = common.Language(id='language', name='Language 1', latitude=10.5, longitude=0.3) language.sources.append(source) for i, type_ in enumerate(common.IdentifierType): id_ = common.Identifier(type=type_.value, id=type_.value + str(i), name='abc') common.LanguageIdentifier(language=language, identifier=id_) for i in range(2, 102): _l = common.Language(id='l%s' % i, name='Language %s' % i) _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i) common.LanguageIdentifier(language=_l, identifier=_i) DBSession.add(_l) param = common.Parameter(id='parameter', name='Parameter') de = common.DomainElement(id='de', name='DomainElement', parameter=param) de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param) valueset = common.ValueSet(id='valueset', language=language, parameter=param, contribution=contribution) value = common.Value(id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high') DBSession.add(value) value2 = common.Value(id='value2', domainelement=de2, valueset=valueset, frequency=50, confidence='high') DBSession.add(value2) paramnd = common.Parameter(id='no-domain', name='Parameter without domain') valueset = common.ValueSet(id='vs2', language=language, parameter=paramnd, contribution=contribution) common.ValueSetReference(valueset=valueset, source=source) value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high') DBSession.add(value) unit = common.Unit(id='unit', name='Unit', language=language) up = common.UnitParameter(id='unitparameter', name='UnitParameter') DBSession.add(unit) DBSession.add( common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up)) up2 = common.UnitParameter(id='up2', name='UnitParameter with domain') de = common.UnitDomainElement(id='de', name='de', parameter=up2) DBSession.add( common.UnitValue(id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de)) DBSession.add(common.Source(id='s')) sentence = common.Sentence( id='sentence', name='sentence name', description='sentence description', analyzed='a\tmorpheme\tdoes\tdo', gloss='a\tmorpheme\t1SG\tdo.SG2', source='own', comment='comment', original_script='a morpheme', language=language, jsondata={'alt_translation': 'Spanish: ...'}) common.SentenceReference(sentence=sentence, source=source) DBSession.add(common.Config(key='key', value='value')) 
common.Config.add_replacement('replaced', 'language', model=common.Language) common.Config.add_replacement('gone', None, model=common.Language) DBSession.flush()
def main(args): data = Data() doi = input('DOI of the released dataset: ') dataset = common.Dataset( id=ewave.__name__, name='eWAVE', description='The Electronic World Atlas of Varieties of English', domain='ewave-atlas.org', published=date.today(), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'doi': doi, 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License'}) DBSession.add(dataset) ed_pattern = re.compile('ed(?P<ord>[0-9]+)$') for c in args.cldf['contributors.csv']: contrib = data.add( models.WaveContributor, c['ID'], id=c['ID'], name=c['Name'], email=c['Email'], url=c['URL'], address=c['Address'], sortkey=HumanName(c['Name']).last, ) m = ed_pattern.match(c['ID']) if m: common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord'))) for fc in args.cldf['featurecategories.csv']: data.add( models.FeatureCategory, fc['ID'], id=fc['ID'], name=fc['Name'], description=fc['Description']) for vt in args.cldf['varietytypes.csv']: data.add( models.VarietyType, vt['ID'], id=vt['ID'], name=vt['Name'], description=vt['Description'], jsondata=VARIETY_TYPE_ICONS[vt['ID']], ) for vt in args.cldf['regions.csv']: data.add(models.Region, vt['ID'], id=vt['ID'], name=vt['Name']) for lang in args.cldf['LanguageTable']: l = data.add( models.Variety, lang['ID'], id=lang['ID'], name=lang['Name'], latitude=lang['Latitude'], longitude=lang['Longitude'], abbr=lang['abbr'], region=data['Region'][lang['Region_ID']], type=data['VarietyType'][lang['Type_ID']], ) if lang['Glottocode']: add_language_codes(data, l, None, glottocode=lang['Glottocode']) c = data.add( models.WaveContribution, lang['ID'], id=str(lang['ID']), name=lang['Name'], description=lang['Description'], variety=l) for i, cid in enumerate(lang['Contributor_ID']): DBSession.add(common.ContributionContributor( contribution=c, contributor=data['WaveContributor'][cid], ord=i+1, )) for param in args.cldf['ParameterTable']: data.add( models.Feature, param['ID'], id=param['ID'], category=data['FeatureCategory'][param['Category_ID']], name=param['Name'], description=param['Description'], jsondata={'example_source': param['Example_Source']}) for de in args.cldf['CodeTable']: data.add( common.DomainElement, de['ID'], id=de['ID'], parameter=data['Feature'][de['Parameter_ID']], name=de['Name'], description=de['Description'], jsondata={'color': CODE_COLORS[de['Name']]}, number=de['Number']) for rec in bibtex.Database.from_file(args.cldf.bibpath): data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec)) for example in args.cldf['ExampleTable']: s = data.add( common.Sentence, example['ID'], id=example['ID'], name=example['Primary_Text'], gloss='\t'.join(example['Gloss']) if example['Gloss'] else None, comment=example['Comment'] or None, description=example['Translated_Text'] or None, language=data['Variety'][example['Language_ID']]) for ref in example['Source']: sid, pages = Sources.parse(ref) DBSession.add(common.SentenceReference( sentence=s, source=data['Source'][sid], description=pages, key=sid)) for value in args.cldf['ValueTable']: de = data['DomainElement'][value['Code_ID']] vs = data.add( common.ValueSet, value['ID'], id=value['ID'], contribution=data['WaveContribution'][value['Language_ID']], parameter=data['Feature'][value['Parameter_ID']], jsondata=de.jsondata, language=data['Variety'][value['Language_ID']]) v = data.add( common.Value, value['ID'], id=value['ID'], domainelement=de, valueset=vs) for eid in value['Example_ID']: 
DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))
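

# A standalone sketch of the editor-ID convention used in the eWAVE import
# above: contributor IDs of the form "ed<N>" mark dataset editors, with N
# giving their ordering. The sample IDs below are invented for illustration.
import re

ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')

for contributor_id in ['ed1', 'ed12', 'kortmann']:
    m = ed_pattern.match(contributor_id)
    if m:
        print(contributor_id, '-> editor, ord', int(m.group('ord')))
    else:
        print(contributor_id, '-> regular contributor')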