Example #1
def main(args):  # pragma: no cover
    data = Data()

    print("Setting up dataset…")
    dataset = common.Dataset(
        id=cariban.__name__,
        domain="cariban.clld.org",
        name="Comparative Cariban Database",
        description="Comparative Cariban Database",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_url="https://www.eva.mpg.de",
        publisher_place="Leipzig",
        license="https://creativecommons.org/licenses/by/4.0/",
        contact="*****@*****.**",
        jsondata={'function_paradigms': []},
    )

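    # Collect (Function, Construction, Morpheme) triples from the ValueTable
    # and store them as dataset-level JSON for rendering function paradigms.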
    fps = []
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            for cons in morph_func["Construction"]:
                fps.append({
                    'Function': function,
                    'Construction': cons,
                    'Morpheme': morph_func['Morpheme']})
    dataset.update_jsondata(function_paradigms=fps)

    DBSession.add(dataset)
    DBSession.flush()

    print("Adding contributors…")
    c = common.Contributor(id="fm", name="Florian Matter", email="*****@*****.**", url="https://florianmatter.gitlab.io/")
    dataset.editors.append(common.Editor(contributor=c, ord=1, primary=True))

    print("Adding languages…")
    dialect_mapping = {}
    lang_shorthands = {}
    glottocodes = {}
    lang_ids = {}
    for lang in args.cldf["LanguageTable"]:
        if lang["Sampled"] == "y":
            language = data.add(
                common.Language,
                lang["ID"],
                id=lang["ID"],
                name=lang["Name"],
                latitude=float(lang["Latitude"]) if lang["Latitude"] is not None else None,
                longitude=float(lang["Longitude"]) if lang["Longitude"] is not None else None,
                jsondata={'Shorthand': lang['Shorthand'], 'Glottocode': lang['Glottocode']},
            )
            add_language_codes(data, language, isocode=lang["ISO"], glottocode=lang["Glottocode"])
        if lang["Dialect_Of"] not in [None, "y"]:
            dialect_mapping[lang["ID"]] = lang["Dialect_Of"]
        lang_shorthands[lang["Shorthand"]] = {"ID": lang["ID"], "Name": lang["Name"]}
        glottocodes[lang["Glottocode"]] = {"ID": lang["ID"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}
        lang_ids[lang["ID"]] = {"Glottocode": lang["Glottocode"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}

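    # Resolve a language key that may be a shorthand, a glottocode or a plain
    # language ID; dialects are mapped to the language they belong to.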
    def get_lang_id(key):
        if key in lang_shorthands:
            lang_id = lang_shorthands[key]["ID"]
        elif key in glottocodes:
            lang_id = glottocodes[key]["ID"]
        elif key in lang_ids:
            lang_id = key
        else:
            print("Could not identify language %s" % key)
            return None
        if lang_id in dialect_mapping:
            lang_id = dialect_mapping[lang_id]
        return lang_id

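    # Bibliographical references come as "bibkey[pages]" strings; e.g. a
    # (hypothetical) "smith2010[15-18]" yields key "smith2010", pages "15-18".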
    def get_key_and_page(source_string):
        if len(source_string.split("[")) > 1:
            bib_key = source_string.split("[")[0]
            pages = source_string.split("[")[1].split("]")[0]
        else:
            bib_key = source_string
            pages = ""
        return bib_key, pages

    print("Adding sources…")
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    
    print("Adding language sources…")
    DBSession.flush()
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        if "keywords" in rec:
            for keyword in rec["keywords"].split(","):
                if keyword in lang_shorthands:
                    lang_id = get_lang_id(keyword.strip(" "))
                    if lang_id in data["Language"]:
                        data.add(common.LanguageSource,
                        rec.id+lang_id,
                        language_pk=data["Language"][lang_id].pk,
                        source_pk=data["Source"][rec.id].pk
                        )
        
    data.add(
        common.Source,
        "pc",
        id="pc",
        name="Personal communication",
        description="Placeholder for data obtained from personal communication.",
        bibtex_type=bibtex.EntryType.misc
    )

#     print("Adding glossing abbreviations…")
#     length = len(pynterlinear.get_all_abbrevs().keys())
#     for i, (key, name) in enumerate(pynterlinear.get_all_abbrevs().items()):
#         print("%s/%s" % (i+1, length), end="\r")
#         DBSession.add(common.GlossAbbreviation(id=key, name=name))
#     print("")
#
    print("Adding examples…")
    gloss_replacements = {
        "S_A_": "Sa",
        "S_P_": "Sp"
    }
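    # Normalize glosses for CLLD display: apply the replacements above and
    # separate digits from person/number letters, e.g. "3P" becomes "3.P".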
    def clldify_glosses(gloss_line):
        for orig, new in gloss_replacements.items():
            gloss_line = gloss_line.replace(orig,new)
        gloss_line = re.sub(r"(\d)([A-Z])", r"\1.\2", gloss_line)
        return gloss_line

    for ex in args.cldf["ExampleTable"]:
        lang_id = get_lang_id(ex["Language_ID"])
        new_ex = data.add(common.Sentence,
            ex["ID"],
            id=ex["ID"],
            name=ex["Name"],
            description=ex["Translated_Text"],
            analyzed="\t".join(ex["Analyzed_Word"]),
            gloss=clldify_glosses("\t".join(ex["Gloss"])),
            language=data["Language"][lang_id],
            comment=ex["Comment"],
            markup_gloss="\t".join(ex["Morpheme_IDs"])
        )
        
        if ex["Source"]:
            bib_key, pages = get_key_and_page(ex["Source"])
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(common.SentenceReference(
                    sentence=new_ex,
                    source=source,
                    key=source.id,
                    description=pages.replace("--","–"))
                )

    def add_morpheme_reference(morpheme, source_string):
        bib_key, pages = get_key_and_page(source_string)
        if bib_key in data["Source"]:
            source = data["Source"][bib_key]
            DBSession.add(models.MorphemeReference(
                morpheme=morpheme,
                source=source,
                key=source.id,
                description=pages.replace("--","–")
                )
            )

    print("Adding morphemes…")
    for morph in args.cldf["FormTable"]:
        lang_id = get_lang_id(morph["Language_ID"])
        form = util.merge_allomorphs("; ".join(morph["Form"])).split("; ")
        new_morph = data.add(models.Morpheme,
            morph["ID"],
            morpheme_type="grammatical",
            language=data["Language"][lang_id],
            name="/".join(form),
            id=morph["ID"],
        )
        
        if morph["Source"]: add_morpheme_reference(new_morph, morph["Source"][0])

    print("Adding constructions…")
    data.add(models.DeclarativeType, "imp", id="imp", name="imperative")
    data.add(models.DeclarativeType, "decl", id="decl", name="declarative")
    data.add(models.MainClauseVerb, "y", id="y", name="main clause construction")
    data.add(models.MainClauseVerb, "n", id="n", name="subordinate clause construction")

    for cons in args.cldf["ParameterTable"]:
        lang_id = get_lang_id(cons["Language_ID"])
        new_construction = data.add(
            models.Construction,
            cons["ID"],
            id=cons["ID"],
            language=data["Language"][lang_id],
            name=cons["Description"],
            mainclauseverb=data["MainClauseVerb"][cons["MainClauseVerb"]],
        )
        if cons["DeclarativeType"]: new_construction.declarativetype = data["DeclarativeType"][cons["DeclarativeType"]]

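    # NOTE: the registry key and the unitparameter lookup below rely on the
    # enclosing loop variable ``function``, while ``func_key`` (the sanitized
    # variant) is only used for the public id and name.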
    def add_morph_func(morpheme, func_key, construction):
        data.add(models.MorphemeFunction,
            "%s:%s" % (morpheme, function),
            id="%s:%s" % (morpheme, func_key),
            name="MorphemeFunction %s:%s"% (morpheme, func_key),
            unit=data["Morpheme"][morpheme],
            unitparameter=data["Meaning"][function],
            construction=construction
        )

    print("Adding morpheme functions…")
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            func_key = function.replace(".","_")
            if ">" in function or function == "LK" or bool(re.search(r"\d[SP]$", function) or function == "3"):
                meaning_type="inflectional"
            else:
                meaning_type="derivational"
            if function not in data["Meaning"]:
                data.add(models.Meaning,
                    function,
                    id=func_key,
                    name=function,
                    meaning_type=meaning_type
                )
            # Only some morpheme functions are specified as occurring in specific constructions.
            if len(morph_func["Construction"]) == 0:
                for morpheme in morph_func["Morpheme"]:
                    add_morph_func(morpheme, func_key, None)
            else:
                for construction in morph_func["Construction"]:
                    if len(morph_func["Morpheme"]) == 1 and morph_func["Morpheme"][0] != "?":
                        for morpheme in morph_func["Morpheme"]:
                            if data["Morpheme"][morpheme].language != data["Construction"][construction].language:
                                print("Warning: the %s Morpheme %s is stated to occur in the %s construction %s!" % (
                                data["Morpheme"][morpheme].language,
                                data["Morpheme"][morpheme],
                                data["Construction"][construction].language,
                                data["Construction"][construction]
                                )
                                )
                            cons_func_key = func_key + ":" + construction
                            add_morph_func(morpheme, cons_func_key, data["Construction"][construction])

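    # Morpheme_IDs use the pattern "morph_id:function"; bare separators
    # ("-", "=", "~") and unidentified morphemes ("X") are skipped.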
    print("Checking examples for illustrated morphemes…")
    proto_languages = ["pc"]
    is_illustrated = {}
    for key, row in data["MorphemeFunction"].items():
        if row.unit.language.id in proto_languages: continue
        is_illustrated["%s:%s" % (row.unit.id, row.unitparameter.id)] = False
    for row in args.cldf["ExampleTable"]:
        for word in row["Morpheme_IDs"]:
            morph_ids = util.split_word(word)
            for unit_value in morph_ids:
                if unit_value in ["X","-","=", "~"]:
                    continue
                unitvaluesentence_key = "{0}-{1}".format(unit_value.replace(".","-"),row["ID"])
                if unitvaluesentence_key in data["UnitValueSentence"].keys():
                    continue
                is_illustrated[unit_value] = True
                morph_id = unit_value.split(":")[0]
                if morph_id not in data["Morpheme"].keys():
                    print("Warning: Example %s illustrates unknown morpheme %s" % (row["ID"], morph_id))
                elif data["Morpheme"][morph_id].language != data["Sentence"][row["ID"]].language:
                    print("Warning: The %s example %s claims to contain the %s morpheme %s." % (
                        data["Sentence"][row["ID"]].language,
                        row["ID"],
                        data["Morpheme"][morph_id].language,
                        data["Morpheme"][morph_id]
                    )
                    )
                if ":" not in unit_value:
                    print("%s in %s contains no defined function!" % (unit_value, row["ID"]))
                function = unit_value.split(":")[1]
                morph_function_id = "%s:%s" % (morph_id, function)
                if morph_function_id not in data["MorphemeFunction"].keys():
                    print("Warning: Example %s tries to illustrate inexistent morpheme function %s!" % (row["ID"], unit_value.replace(".","-")))
                    continue
                data.add(models.UnitValueSentence,
                    unitvaluesentence_key,
                    sentence=data["Sentence"][row["ID"]],
                    unitvalue=data["MorphemeFunction"][morph_function_id],
                )


    # see how many morpheme functions are illustrated with example sentences
    good_ill = [key for key, value in is_illustrated.items() if value]
    not_ill = [key for key, value in is_illustrated.items() if not value]
    not_ill.sort()
    cov = len(good_ill) / len(is_illustrated) * 100
    print("Morpheme exemplification coverage is at %s%%. List of unillustrated morphemes saved to unillustrated_morphemes.txt" % str(round(cov, 2)))
    with open("../unillustrated_morphemes.txt", "w") as f:
        for morph in not_ill:
            f.write(morph + "\n")

    print("Adding cognate sets…")
    for cogset in args.cldf["CognatesetTable"]:
        new_cset = data.add(models.Cognateset,
            cogset["ID"],
            id=cogset["ID"],
            name=cogset["Name"],
            description=cogset["Function"],
            cogset_type="grammatical"
        )
        if cogset["Source"]:
            for source in cogset["Source"]:
                bib_key, pages = get_key_and_page(source)
                if bib_key in data["Source"]:
                    source = data["Source"][bib_key]
                    DBSession.add(models.CognatesetReference(
                        cognateset=new_cset,
                        source=source,
                        key=source.id,
                        description=pages)
                        )

    print("Adding cognates…")
    for morph in args.cldf["FormTable"]:
        for cognate_ID in morph["Cognateset_ID"]:
            DBSession.add(models.Cognate(
                    cognateset=data["Cognateset"][cognate_ID],
                    counterpart=data["Morpheme"][morph["ID"]]
                    )
            )

    print("Adding morpheme comments…")
    for row in args.cldf["FormTable"]:
        data["Morpheme"][row["ID"]].markup_description=util.generate_markup(row["Comment"])

    print("Adding construction descriptions…")
    for cons in args.cldf["ParameterTable"]:
        if cons["Comment"] is None:
            description = ""
        else:
            description = util.generate_markup(cons["Comment"])
        description += "\n" + util.generate_markup(util.transitive_construction_paradigm(cons["ID"]))
        description += util.generate_markup(util.intransitive_construction_paradigm(cons["ID"]))
        data["Construction"][cons["ID"]].markup_description = description


    print("Adding cognate set descriptions…")
    for cogset in args.cldf["CognatesetTable"]:
        data["Cognateset"][cogset["ID"]].markup_description = util.generate_markup(cogset["Description"])
        # if cogset["ID"] == "13pro":
        #     data["Cognateset"][cogset["ID"]].markup_description += util.generate_markup(
        #         util.comparative_function_paradigm(
        #             ["apa_main", "tri_main", "way_main", "mak_main", "kar_main", "hix_main", "wai_main", "ara_main", "ikp_main", "wmr_main", "pan_old", "kax_main"],
        #             "1+3 scenarios",
        #             ["1+3S", "1+3>3", "3>1+3", "2>1+3", "1+3>2"]))

    
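    # Rename Newick tip labels from language IDs to display names, keeping
    # track of tips marked with "?" as uncertain.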
    def add_tree_labels(phylo):
        uncertain_nodes = []
        for node in phylo.find_clades():
            if node.name is None or not node.is_terminal():
                continue
            plain_name = node.name.replace("?","")
            if "?" in node.name: uncertain_nodes.append(plain_name)
            if plain_name in lang_ids:
                node.name = lang_ids[plain_name]["Name"].replace("'", "’")
            if plain_name in uncertain_nodes: node.name += "?"
        return phylo, uncertain_nodes
        
    print("Adding trees…")
    own_trees = ["matter"]
    tree_path = str(args.cldf.tablegroup._fname.parent / '..' / 'raw')
    newick_files = {}
    for tree in args.cldf["cariban_trees.csv"]:
        if tree["ID"] in own_trees: continue
        newick_files[tree["ID"]] = {
            "orig": tree["ID"]+"_orig.newick",
            "norm": tree["ID"]+"_norm.newick",
            "source": tree["Source"],
            "comment": tree["Comment"],
            "o_comment": tree["Orig_Comment"]
        }
    #adding my own trees separately.
    for my_tree_count, tree_id in enumerate(own_trees):
        my_tree = Phylo.read(tree_path+"/"+"%s.newick" % tree_id, "newick")
        my_tree, uncertain_nodes = add_tree_labels(my_tree)
        
        edited_tree = io.StringIO()
        Phylo.write(my_tree, edited_tree, "newick")
        tree = edited_tree.getvalue().replace(":0.00000","")
        
        my_phylo = Phylogeny(
                tree_id,
                id=tree_id,
                name="Matter (2020)",# % str(my_tree_count+1),
                newick=tree,
                markup_description="My own, conservative, classification."
        )
        
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes: lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=my_phylo
                )
            )
              
        DBSession.add(my_phylo)
        
    #adding the other trees
    for tree_id, values in newick_files.items():
        norm_biotree = Phylo.read(tree_path+"/"+values["norm"], "newick")
        orig_biotree = Phylo.read(tree_path+"/"+values["orig"], "newick")
        
        norm_biotree, uncertain_nodes = add_tree_labels(norm_biotree)
            
        edited_tree = io.StringIO()
        Phylo.write(norm_biotree, edited_tree, "newick")
        norm_tree = edited_tree.getvalue().replace(":0.00000","")
        
        edited_tree = io.StringIO()
        Phylo.write(orig_biotree, edited_tree, "newick")
        orig_tree = edited_tree.getvalue().replace(":0.00000","")
        
        norm_phylo = Phylogeny(
                id=tree_id+"_norm",
                name=str(data["Source"][values["source"]]) + " (Normalized)",
                markup_description=util.generate_markup("Source: src:"+values["source"])+
                "<br>This is a normalized version of <a href='/phylogeny/%s_orig'>this original tree</a>." % tree_id +
                util.generate_markup(
                    "<br>Comments: %s" % values["comment"]
                ),
                newick=norm_tree
        )
        
        if values["o_comment"] == None:
            o_comment = ""
        else:
            o_comment = values["o_comment"]
        orig_phylo = Phylogeny(
                id=tree_id+"_orig",
                name=str(data["Source"][values["source"]]) + " (Original)",
                markup_description=util.generate_markup("Source: src:"+values["source"])+
                    "<br>This is a representation of the original classification. A normalized version can be found <a href='/phylogeny/%s_norm'>here</a>." % tree_id +
                    util.generate_markup(
                    "<br>Comments: %s" % values["comment"] +
                    " " + o_comment
                    ),
                newick=orig_tree
        )
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes: lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=norm_phylo
                )
            )
        DBSession.add(norm_phylo)
        DBSession.add(orig_phylo)

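    # So-called t-adding verbs optionally show a reflex of the reconstructed
    # *t- prefix; each row of cariban_t_cognates.csv becomes a cognate set
    # named after the reconstructed form.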
    print("Adding t-adding verb cognate sets…")
    for t_verb_set in args.cldf["cariban_t_cognates.csv"]:
        cognate_ID = "t"+t_verb_set["ID"]
        rec_t_form = "*[%s]%s" % (t_prefix_form(t_verb_set["Form"]), t_verb_set["Form"])
        t_cogset = data.add(models.Cognateset,
            cognate_ID,
            id=cognate_ID,
            name=rec_t_form,
            description="‘%s’ (*t-adding verb)" % t_verb_set["Parameter_ID"],
            cogset_type="t_adding"
        )
        if t_verb_set["Source"]:
            bib_key = t_verb_set["Source"].split("[")[0]
            if len(t_verb_set["Source"].split("[")) > 1:
                pages = t_verb_set["Source"].split("[")[1].split("]")[0]
            else:
                pages = " "
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(models.CognatesetReference(
                    cognateset=t_cogset,
                    source=source,
                    key=source.id,
                    description=pages)
                    )
    
    print("Adding t-adding verbs…")
    t_langs = {}
    t_verbs = {}
    non_t_adding_lgs = ["ing","mac","kar","wmr","pan"]
    data.add(models.Meaning,
        "t_verb",
        id="t-verb",
        name="t-adding verb",
    )
    for t_verb_entry in args.cldf["cariban_t_verbs.csv"]:
        if t_verb_entry["Language_ID"] == "cari1283": continue
        cognate_ID = "t"+t_verb_entry["Cognateset_ID"]
        lang_id = get_lang_id(t_verb_entry["Language_ID"])
        morph_id = lang_id+"_"+cognate_ID
        if morph_id in data["Morpheme"].keys():
            if morph_id + "_2" in data["Morpheme"].keys():
                morph_id += "_3"
            else:
                morph_id += "_2"
        t_verb = data.add(models.Morpheme,
            morph_id,
            id=morph_id,
            morpheme_type="t_adding",
            name=t_verb_entry["Form"],
            language=data["Language"][lang_id],
        )
        DBSession.add(models.Cognate(
                cognateset=data["Cognateset"][cognate_ID],
                counterpart=t_verb
            )
        )
        if t_verb_entry["t"] == "y":
            t_verb.name = "[%s]%s" % (t_prefix_form(t_verb.name), t_verb.name)
            t_verb.markup_description = util.generate_markup("Shows cogset:t")
        if t_verb_entry["t"] == "?" and lang_id not in non_t_adding_lgs:
            t_verb.name = "[t-?]"+t_verb.name
            t_verb.markup_description = util.generate_markup("It is not known if this verb shows cogset:t")
        if t_verb_entry["t"] == "n":
            t_verb.markup_description = util.generate_markup("Does not show cogset:t")
        if lang_id not in t_langs.keys():
            t_langs[lang_id] = {"y": 0, "n": 0, "?": 0}
        if cognate_ID not in t_verbs.keys():
            t_verbs[cognate_ID] = {"y": 0, "n": 0, "?": 0}
        t_langs[lang_id][t_verb_entry["t"]] += 1
        if lang_id not in non_t_adding_lgs:
            t_verbs[cognate_ID][t_verb_entry["t"]] += 1
        if t_verb_entry["Source"]:
            add_morpheme_reference(t_verb, t_verb_entry["Source"])

        data.add(models.MorphemeFunction,
            "t_"+t_verb_entry["ID"],
            id="t_"+t_verb_entry["ID"],
            name="t-Verb %s" % t_verb_entry["Parameter_ID"],
            unit=t_verb,
            unitparameter=data["Meaning"]["t_verb"],
            construction=None
        )
    for lang, values in t_langs.items():
        data["Language"][lang].update_jsondata(t_values=values)
    for verb, values in t_verbs.items():
        # data["Cognateset"][verb].description += " (%s/%s)" % (str(values["y"]), str(values["n"]+values["y"]+values["?"]))
        data["Cognateset"][verb].markup_description = util.generate_markup("This verb occurs with obj:t- in %s of %s languages which show reflexes of cogset:t." % (str(values["y"]), str(values["n"]+values["y"]+values["?"])))

    print("Adding reconstructed lexemes…")
    proto_forms = {}
    for cogset in args.cldf["cariban_lexical_reconstructions.csv"]:
        proto_forms[cogset["ID"]] = cogset["Form"]

    first_found = []
    for entry in args.cldf["cariban_swadesh_list.csv"]:
        cognateset_ID = entry["Parameter_ID"].replace("/","_")+"-"+entry["Cognateset_ID"]
        if cognateset_ID not in data["Cognateset"]:
            if cognateset_ID in proto_forms:
                form = "*" + proto_forms[cognateset_ID].replace("; ", " / ")
            # else:
            #     form = ""
                data.add(models.Cognateset,
                    cognateset_ID,
                    id=cognateset_ID,
                    name=form,
                    description=cognateset_ID,
                    cogset_type="lexical"
                )
        lang_id = get_lang_id(entry["Language_ID"])
        if lang_id not in data["Language"]: continue
        function = entry["Parameter_ID"].replace(".","_")
        morph_id = entry["Language_ID"] + "_" + function
        if morph_id in first_found: continue
        first_found.append(morph_id)
        if function not in data["Meaning"].keys():
            data.add(models.Meaning,
                function,
                id=function,
                name=function,
                meaning_type="lexical"
            )
        morpheme = data.add(models.Morpheme,
                    morph_id,
                    id=morph_id,
                    morpheme_type="lexical",
                    name=entry["Value"][0],
                    language=data["Language"][lang_id],
                )
        data.add(models.MorphemeFunction,
            "%s:%s" % (morph_id, function),
            id="%s:%s" % (morph_id, function),
            name="MorphemeFunction %s:%s"% (morph_id, function),
            unit=data["Morpheme"][morph_id],
            unitparameter=data["Meaning"][function],
            construction=None
        )
        if entry["Source"]:
            add_morpheme_reference(morpheme, entry["Source"])
        
        if cognateset_ID in proto_forms:
            DBSession.add(models.Cognate(
                    cognateset=data["Cognateset"][cognateset_ID],
                    counterpart=morpheme
                )
            )
Example #2
def main(args):  # pragma: no cover
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    clts = CLTS(
        input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

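    # Map form IDs to their audio files (media with MIME type audio/mpeg).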
    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for ord, author in enumerate(authors):
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(
            vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        c = data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    for ord, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(
            common.Editor(ord=ord,
                          dataset=ds,
                          contributor=data['Contributor'][cid]))

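    # Collect per-language contributor roles from the "X and Y"-style strings
    # in contributions.csv.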
    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
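    # Track per-language segment frequencies; the inventories are mapped to
    # CLTS BIPA below and stored as JSON data on each variety.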
    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(Counterpart,
                 form['id'],
                 id=form['id'],
                 name=form['form'],
                 valueset=vs,
                 audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv
                       if getattr(c, 'name', None)])
Example #3
def main(args):  # pragma: no cover
    ds = StructureDataset.from_metadata(DS)
    data = Data()
    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))

    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={
                'readme': contrib['Readme'],
                'contents': contrib['Contents']
            },
        )
        for src in contrib['Source']:
            DBSession.add(
                models.ContributorReference(source=data['Source'][src],
                                            contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, (cid, name) in enumerate([
        ('UZ', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
    ], start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=i, contributor=contrib))

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(data, [(l.id, l)
                         for l in data['Variety'].values() if len(l.id) == 8],
                  glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)

    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(),
                                     key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    for segment in ds['ParameterTable']:
        # Equivalence class: the segment name with all COMBINING/MODIFIER
        # characters stripped.
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
        ])
        data.add(models.Segment,
                 segment['ID'],
                 id=segment['ID'],
                 name=segment['Name'],
                 description=segment['Description'],
                 segment_class=segment['SegmentClass'],
                 equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent /
            'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid),
                    languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(
                    common.Parameter_data(
                        key=feature_name(k),
                        value=v,
                        ord=i,
                        object_pk=data['Segment'][segment['ID']].pk))

    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(
            common.ContributionContributor(
                contribution=inv,
                contributor=data['Contributor'][
                    inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(
                common.ContributionReference(contribution=inv,
                                             source=data['Source'][src]))

    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])

        for ref in phoneme['Source']:
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            models.Phoneme(
                id=phoneme['ID'],
                name='%s %s' %
                (phoneme['Value'],
                 data['Inventory'][phoneme['Contribution_ID']].name),
                allophones=' '.join(phoneme['Allophones']),
                marginal=phoneme['Marginal'],
                valueset=vs))

    return
Example #4
def main(args):
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor,
                 ed,
                 dataset=ds,
                 contributor=data['Contributor'][ed],
                 ord=i)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
Example #5
def main(args):
    _ = args
    data = Data()
    cldf_data = args.cldf

    data.add(common.Contributor,
             'fehnannemarie',
             id='fehnannemarie',
             name="Anne-Marie Fehn",
             url="https://shh.mpg.de")

    # TODO: Editors/Contributors
    dataset = common.Dataset(id=kba.__name__,
                             name="KBA",
                             publisher_name="Max Planck Institute for the "
                             "Science of Human History",
                             publisher_place="Jena",
                             publisher_url="http://www.shh.mpg.de",
                             license="http://creativecommons.org/licenses/by"
                             "/4.0/",
                             domain='kba.clld.org',
                             jsondata={
                                 'license_icon':
                                 'cc-by.png',
                                 'license_name':
                                 'Creative Commons '
                                 'Attribution 4.0 '
                                 'International '
                                 'License'
                             })

    DBSession.add(dataset)

    for i, editor in enumerate(['fehnannemarie']):
        common.Editor(dataset=dataset,
                      contributor=data['Contributor'][editor],
                      ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for language in cldf_data['LanguageTable']:
        lang = data.add(models.KbaLanguage,
                        language['ID'],
                        id=language['ID'],
                        name=language['Name'])
        add_language_codes(data, lang, None, glottocode=language['Glottocode'])

    # TODO: Concepticon
    for parameter in cldf_data['ParameterTable']:
        data.add(common.Parameter,
                 parameter['ID'],
                 id=parameter['ID'],
                 name='{0} ({1})'.format(parameter['Name'], parameter['ID']))

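    # One ValueSet per (parameter, language) pair; each form becomes a Word
    # attached to it.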
    for form in cldf_data['FormTable']:
        valueset_id = '{0}-{1}'.format(form['Parameter_ID'],
                                       form['Language_ID'])
        valueset = data['ValueSet'].get(valueset_id)

        # Unless we already have something in the VS:
        if not valueset:
            valueset = data.add(
                common.ValueSet,
                valueset_id,
                id=valueset_id,
                language=data['KbaLanguage'][form['Language_ID']],
                parameter=data['Parameter'][form['Parameter_ID']],
                contribution=contrib)

        DBSession.add(
            models.Word(id=form['ID'],
                        name=form['Form'],
                        comment=form.get('Comment'),
                        sourceorthography=form.get('sourceorthography'),
                        kbaorthography=form.get('kbaorthography'),
                        wordclass=form.get('wordclass'),
                        grammaticalnotes=form.get('grammaticalnotes'),
                        idiolectalvariant=form.get('idiolectalvariant'),
                        originaltranslation=form.get('originaltranslation'),
                        valueset=valueset))

    load_families(data,
                  [(l.glottocode, l) for l in data['KbaLanguage'].values()],
                  glottolog_repos=args.glottolog,
                  isolates_icon='tcccccc')
Example #6
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    data = Data()
    ds = data.add(
        common.Dataset,
        lsi.__name__,
        id=lsi.__name__,
        name='The Comparative Vocabularies of the "Linguistic Survey of India" Online',
        domain='lsi.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    for i, name in enumerate(['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode',
                          'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            order=int(lang['Order']),
            number=lang['NumberInSource'],
            family_in_source=lang['FamilyInSource'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id',
                           'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            description=param['Concepticon_Gloss'],
            concepticon_id=param['concepticonReference'],
            pages=param['PageNumber'],
        )

    inventories = collections.defaultdict(set)
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form',
                          'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],
            id=form['id'],
            name=form['form'],
            description=''.join(form['Segments']).replace('+', ' '),
            segments=' '.join(form['Segments']),
            valueset=vs,
        )
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(inventory=[(str(c), c.name)
                                                        for c in inv
                                                        if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
Example #7
def main(args):
    data = Data()

    icons = cycle(ORDERED_ICONS)

    dataset = common.Dataset(
        id=gelato.__name__,
        name="GeLaTo",
        description="Genes and Languages together",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='gelato.clld.org',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })

    for i, (id_, name) in enumerate([('barbierichiara', 'Chiara Barbieri'),
                                     ('blasidamian', 'Damián Blasi'),
                                     ('forkelrobert', 'Robert Forkel')]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    families = {}

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for r in args.cldf.iter_rows('ContributionTable', 'id', 'name',
                                 'description'):
        ds = data.add(models.Panel,
                      r['id'],
                      id=r['id'],
                      name=r['name'],
                      description=r['description'])
    for row in args.cldf.iter_rows('LanguageTable', 'id', 'name',
                                   'contributionReference'):
        icon = families.get(row['LanguageFamily_Glottocode'])
        if not icon:
            families[row['LanguageFamily_Glottocode']] = icon = next(icons)
        lang = data['Languoid'].get(row['Glottocode'])
        if not lang:
            lang = data.add(
                models.Languoid,
                row['Glottocode'],
                id=row['Glottocode'],
                name=row['Language_Name'],
                family_id=row['LanguageFamily_Glottocode'],
                family_name=row['LanguageFamily'],
                jsondata=dict(icon=icon.name),
            )
        s = data.add(
            models.Sample,
            row['id'],
            id=row['id'],
            name=row['Name'],
            panel=data['Panel'][row['contributionReference']],
            languoid=lang,
            latitude=row['Latitude'],
            longitude=row['Longitude'],
            samplesize=int(row['samplesize']),
            #source=row.get('dataSet.of.origin'),
            region=row['geographicRegion'],
            #location=row['location'],
            jsondata=dict(color=REGIONS[row['geographicRegion']]),
        )
        DBSession.flush()
        for bibkey in row['Source']:
            DBSession.add(
                common.LanguageSource(language_pk=s.pk,
                                      source_pk=data['Source'][bibkey].pk))

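    # Parameter values are typed; Datatype instances (presumably csvw's) are
    # used to parse the raw Value strings below.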
    types = {}
    for row in args.cldf.iter_rows('ParameterTable', 'id', 'name',
                                   'description', 'contributionReference'):
        types[row['id']] = Datatype.fromvalue(row['datatype'])
        data.add(models.Measure,
                 row['id'],
                 id=row['id'],
                 name=row['name'],
                 description=row['description'],
                 panel=data['Panel'][row['contributionReference']])

    for row in args.cldf.iter_rows('ValueTable', 'id', 'parameterReference',
                                   'languageReference'):
        v = types[row['parameterReference']].read(row['Value'])
        if isinstance(v, float):
            vs = data.add(
                common.ValueSet,
                row['id'],
                id=row['id'],
                language=data['Sample'][row['languageReference']],
                parameter=data['Measure'][row['parameterReference']],
                #contribution=ds,
                #jsondata=dict(color=REGIONS[sample.region]),
            )
            data.add(models.Measurement,
                     row['id'],
                     id=row['id'],
                     valueset=vs,
                     name=row['Value'],
                     value=v)
Example #8
def main(args):  # pragma: no cover
    data = Data()
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    ds = data.add(
        common.Dataset,
        tppsr.__name__,
        id=tppsr.__name__,
        name='Tableaux phonétiques des patois suisses romands Online',
        domain='tppsr.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name)
        )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['Number'],
            name=lang['name'],
            description=lang['FullName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            canton=lang['Canton'],
            group=lang['DialectGroup'],
            recorded=lang['DateOfRecording'],
            population=int(lang['Population']) if lang['Population'] else None,
            speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None,
            speaker_proficiency=lang['SpeakerProficiency'],
            speaker_language_use=lang['SpeakerLanguageUse'],
            speaker_gender=lang['SpeakerGender'],
            investigators=lang['Investigators'],
        )
    colors = qualitative_colors(len(set(l.canton for l in data['Variety'].values())), set='tol')
    for i, (_, langs) in enumerate(itertools.groupby(
        sorted(data['Variety'].values(), key=lambda l: l.canton),
        lambda l: l.canton,
    )):
        for lang in langs:
            lang.update_jsondata(color=colors[i])

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)
    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['Number'],
            number=int(param['Number']),
            name='{} [{}]'.format(param['name'], param['Number']),
            latin_gloss=param['Latin_Gloss'],
            french_gloss=param['French_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            concepticon_concept_id=param['id'].split('_')[0],
        )

    inventories = collections.defaultdict(set)
    scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl
    for form in iteritems(args.cldf, 'FormTable', 'id', 'value', 'form', 'languageReference', 'parameterReference', 'source'):
        if not form['form']:
            continue
        inventories[form['languageReference']] = inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        f = data.add(
            models.Form,
            form['id'],  # Gauchat-1925-480-1_
            id=form['id'],
            name=form['form'].replace('+', ' '),
            description=form['value'],
            segments=' '.join(form['Segments']),
            valueset=vs,
            scan=scan_url_template.expand(**form),
            prosodic_structure=form['ProsodicStructure'],
        )

    for example in args.cldf['ExampleTable']:
        sentence = models.Phrase(
            id=example['ID'],
            language=data['Variety'][example['Language_ID']],
            name=example['Primary_Text'],
            description=example['Translated_Text'],
            original_script=example['Alt_Transcription'],
        )
        for cid in example['Concept_ID']:
            DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence))
        for fid in example['Form_ID']:
            DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence))

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))
        ))
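
The loader above splits CLDF source references with pycldf's Sources.parse, which implements the `key[pages]` convention; a quick illustration (the key is a made-up sample):

from pycldf.sources import Sources

sid, pages = Sources.parse('Gauchat1925[480]')
assert (sid, pages) == ('Gauchat1925', '480')  # bibtex key, page spec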
Example #9
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    DBSession.add(
        common.ContributionContributor(
            contribution=contrib,
            contributor=data['Contributor']['kondic'],
        ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor,
                 ed,
                 dataset=ds,
                 contributor=data['Contributor'][ed],
                 ord=i)

    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            subgroup=glang.lineage[1][0]
            if glang and len(glang.lineage) > 1 else None,
        )
    colors = dict(
        zip(
            set(l.subgroup for l in data['Variety'].values()),
            qualitative_colors(
                len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    # Store proto-forms for later lookup:
    proto_forms = collections.defaultdict(
        lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][
                form['parameterReference']].append(form['form'])

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
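
The ancestor detection above walks the Glottolog classification; a minimal sketch of that lookup, assuming pyglottolog and a local glottolog clone ('mixe1284' is a hypothetical glottocode used for illustration):

from pyglottolog import Glottolog

gl = Glottolog('path/to/glottolog')
glang = gl.languoid('mixe1284')
# lineage is a list of (name, glottocode, level) triples, top-level family first
family_names = [name for name, gc, level in glang.lineage]
if 'Mixe-Zoque' in family_names:
    print('gets Proto-Mixe-Zoque reconstructions attached')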
Example #10
def main(args):  # pragma: no cover
    get_repos()
    api = Grambank(REPOS['Grambank'])
    cldf = args.cldf
    data = Data()
    dataset = models.Grambank(
        id=grambank.__name__,
        name="Grambank",
        description="Grambank",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    contributors = {}
    for i, contrib in enumerate(api.contributors):
        contrib = common.Contributor(
            contrib.id,
            id=contrib.id,
            name=contrib.name,
        )
        common.Editor(dataset=dataset, contributor=contrib, ord=i)
        DBSession.add(contrib)
        DBSession.flush()
        contributors[contrib.id] = contrib.pk
    contributions = {r['ID']: r for r in cldf['LanguageTable']}

    DBSession.add(dataset)

    for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)),
                    desc='sources'):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()
    sources = {k: v.pk for k, v in data['Source'].items()}

    features, codes = import_features(cldf, contributors)
    transaction.commit()

    values_by_sheet = [(lid, list(v)) for lid, v in itertools.groupby(
        sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']),
        lambda r: r['Language_ID'],
    )]
    for lid, values in tqdm(values_by_sheet, desc='loading values'):
        transaction.begin()
        import_values(values, contributions[lid], features, codes,
                      contributors, sources)
        transaction.commit()

    transaction.begin()

    glottolog = Glottolog(REPOS['glottolog'])
    languoids = {l.id: l for l in glottolog.languoids()}
    gblangs = DBSession.query(models.GrambankLanguage).all()
    load_families(data,
                  gblangs,
                  glottolog_repos=REPOS['glottolog'],
                  isolates_icon='dcccccc')

    # Add isolates
    for lg in gblangs:
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family,
                gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    coverage.main(glottolog)
    return
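
Worth noting in Example #10: values are committed one transaction per language sheet, so memory stays bounded and a failed sheet doesn't roll back the whole load. Distilled (assuming the zope transaction package that clld apps use; the loop names are hypothetical):

import transaction

for lid, values in values_by_sheet:  # hypothetical (language id, rows) pairs
    transaction.begin()              # fresh transaction per sheet
    import_batch(values)             # hypothetical loader issuing DBSession.add() calls
    transaction.commit()             # persist this sheet before starting the next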
Example #11
def main(args):
    data = Data()
    doi = input('DOI of the released dataset: ')

    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date.today(),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'doi': doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
    for c in args.cldf['contributors.csv']:
        contrib = data.add(
            models.WaveContributor,
            c['ID'],
            id=c['ID'],
            name=c['Name'],
            email=c['Email'],
            url=c['URL'],
            address=c['Address'],
            sortkey=HumanName(c['Name']).last,
        )
        m = ed_pattern.match(c['ID'])
        if m:
            common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord')))

    for fc in args.cldf['featurecategories.csv']:
        data.add(
            models.FeatureCategory, fc['ID'],
            id=fc['ID'], name=fc['Name'], description=fc['Description'])

    for vt in args.cldf['varietytypes.csv']:
        data.add(
            models.VarietyType, vt['ID'],
            id=vt['ID'],
            name=vt['Name'],
            description=vt['Description'],
            jsondata=VARIETY_TYPE_ICONS[vt['ID']],
        )

    for vt in args.cldf['regions.csv']:
        data.add(models.Region, vt['ID'], id=vt['ID'], name=vt['Name'])

    for lang in args.cldf['LanguageTable']:
        l = data.add(
            models.Variety, lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
            latitude=lang['Latitude'],
            longitude=lang['Longitude'],
            abbr=lang['abbr'],
            region=data['Region'][lang['Region_ID']],
            type=data['VarietyType'][lang['Type_ID']],
        )
        if lang['Glottocode']:
            add_language_codes(data, l, None, glottocode=lang['Glottocode'])
        c = data.add(
            models.WaveContribution, lang['ID'],
            id=str(lang['ID']),
            name=lang['Name'],
            description=lang['Description'],
            variety=l)
        for i, cid in enumerate(lang['Contributor_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=data['WaveContributor'][cid],
                ord=i+1,
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            models.Feature, param['ID'],
            id=param['ID'],
            category=data['FeatureCategory'][param['Category_ID']],
            name=param['Name'],
            description=param['Description'],
            jsondata={'example_source': param['Example_Source']})


    for de in args.cldf['CodeTable']:
        data.add(
            common.DomainElement, de['ID'],
            id=de['ID'],
            parameter=data['Feature'][de['Parameter_ID']],
            name=de['Name'],
            description=de['Description'],
            jsondata={'color': CODE_COLORS[de['Name']]},
            number=de['Number'])

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for example in args.cldf['ExampleTable']:
        s = data.add(
            common.Sentence, example['ID'],
            id=example['ID'],
            name=example['Primary_Text'],
            gloss='\t'.join(example['Gloss']) if example['Gloss'] else None,
            comment=example['Comment'] or None,
            description=example['Translated_Text'] or None,
            language=data['Variety'][example['Language_ID']])

        for ref in example['Source']:
            sid, pages = Sources.parse(ref)
            DBSession.add(common.SentenceReference(
                sentence=s, source=data['Source'][sid], description=pages, key=sid))

    for value in args.cldf['ValueTable']:
        de = data['DomainElement'][value['Code_ID']]
        vs = data.add(
            common.ValueSet, value['ID'],
            id=value['ID'],
            contribution=data['WaveContribution'][value['Language_ID']],
            parameter=data['Feature'][value['Parameter_ID']],
            jsondata=de.jsondata,
            language=data['Variety'][value['Language_ID']])
        v = data.add(
            common.Value, value['ID'],
            id=value['ID'],
            domainelement=de,
            valueset=vs)

        for eid in value['Example_ID']:
            DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))
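
Editor order in Example #11 is encoded in the contributor IDs themselves; the regex decodes it:

import re

ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
m = ed_pattern.match('ed2')               # contributor 'ed2' is editor number 2
assert m and int(m.group('ord')) == 2
assert ed_pattern.match('smith') is None  # ordinary contributors don't match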
Example #12
def main(args):
    internal = input(
        '[i]nternal or [e]xternal data (default: e): ').strip().lower() == 'i'
    which_submission = input(
        "submission id or 'all' for all submissions (default: all): ").strip(
        ).lower() or 'all'

    data = Data()

    dataset = common.Dataset(
        id=crossgram.__name__,
        name='Crossgram',
        description='Crossgram',
        published=date(2019, 12, 12),
        domain='crossgram.clld.org',
        # XXX Is any of this correct?
        publisher_name='Max Planck Institute for the Science of Human History',
        publisher_place='Jena',
        publisher_url='https://ssh.mpg.de',
        license='http://creativecommons.org/licenses/by/4.0',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    internal_repo = pathlib.Path('../../crossgram/crossgram-internal')
    cache_dir = internal_repo / 'datasets'
    cache_dir.mkdir(exist_ok=True)

    if internal:
        submissions_path = internal_repo / 'submissions-internal'
    else:
        submissions_path = internal_repo / 'submissions'

    language_id_map = {}
    for contrib_dir in submissions_path.iterdir():
        if not contrib_dir.is_dir():
            continue
        if which_submission != 'all' and which_submission != contrib_dir.name:
            continue
        sid = contrib_dir.name
        print('Loading submission', sid, '...')

        contrib_md = jsonlib.load(contrib_dir / 'md.json')
        intro = None
        try:
            with (contrib_dir / 'intro.md').open(encoding='utf-8') as f:
                intro = f.read()
        except IOError:
            # If there is no intro, there is no intro *shrug*
            pass

        path = download_data(sid, contrib_md, cache_dir)
        if not path.exists():
            print('could not find folder', str(path))
            continue

        submission = CLDFBenchSubmission.load(path, contrib_md)

        date_match = re.fullmatch(r'(\d+)-(\d+)-(\d+)', contrib_md['published'])
        assert date_match
        yyyy, mm, dd = date_match.groups()
        published = date(int(yyyy), int(mm), int(dd))

        # rewrite the ssh-style git remote as an https URL
        git_https = re.sub('^git@([^:]*):', r'https://\1/',
                           contrib_md.get('repo') or '')

        contrib = data.add(models.CrossgramData,
                           sid,
                           id=sid,
                           number=int(contrib_md['number']),
                           published=published,
                           name=submission.title,
                           doi=contrib_md.get('doi'),
                           git_repo=git_https,
                           description=intro or submission.readme)

        submission.add_to_database(data, language_id_map, contrib)
        print('... done')

    DBSession.flush()
    print('Loading language family data...')
    catconf = cldfcatalog.Config.from_file()
    glottolog_path = catconf.get_clone('glottolog')
    load_families(Data(), [
        v for v in DBSession.query(models.Variety)
        if re.fullmatch('[a-z]{4}[0-9]{4}', v.id)
    ],
                  strict=False,
                  glottolog_repos=glottolog_path)
    print('... done')
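
The SSH-to-HTTPS rewrite in Example #12 is easiest to check against a concrete remote (the repository URL is just a sample):

import re

url = re.sub('^git@([^:]*):', r'https://\1/', 'git@github.com:org/repo.git')
assert url == 'https://github.com/org/repo.git'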
Example #13
def main(args):

    data = Data()

    dataset = common.Dataset(
        id=amsd.__name__,
        name="AMSD",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='amsd.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    editors = OrderedDict([('Piers Kelly', None)])

    # data_entry => Contributor
    for row in sorted(dicts('data_entry'), key=lambda x: [x['name'].lower()]):
        if row['name'] in editors:
            editors[row['name']] = row['pk']
        data.add(
            common.Contributor,
            row['pk'],
            id=row['pk'],
            name=row['name']
        )

    for i, cid in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=data['Contributor'][cid], ord=i + 1)

    for row in dicts('source_citation'):
        data.add(
            common.Source,
            row['pk'],
            id=row['pk'],
            note=row['name'],
            name=row['name'],
        )

    for row in dicts('ling_area'):
        data.add(
            models.ling_area,
            row['pk'],
            chirila_name=row['chirila_name'],
            austlang_code=row['austlang_code'],
            austlang_name=row['austlang_name'],
            glottolog_code=row['glottolog_code'],
        )
    fd = {}
    for row in dicts('linked_filenames'):
        if row['name'] not in ['00-Text_reference.png', '00-No_image_available.png']:
            fd[row['pk']] = dict(
                name=row['name'],
                oid=row['oid'],
                path=row['path'],
                mimetype=mimetypes.guess_type(row['path'])[0] if row['path'] else None,
            )

    for m in 'item_type technique keywords material source_type '\
            'sem_domain holder_file item_subtype cultural_region'.split():
        for row in dicts(m):
            data.add(
                getattr(models, m),
                row['pk'],
                name=row['name'],
            )

    DBSession.flush()

    # sticks => MessageStick
    no_fts_cols = ['pk', 'latitude', 'longitude', 'item_type',
                   'irn', 'data_entry', 'dim_1', 'dim_2', 'dim_3', 'data_entry',
                   'ling_area_1', 'ling_area_2', 'ling_area_3', 'holder_file']
    x_cols = ['sem_domain', 'material', 'source_type', 'technique', 'keywords',
              'holder_file', 'item_type', 'item_subtype', 'cultural_region']
    for i, row in enumerate(dicts('sticks')):

        fts_items = []
        for col in row.keys():
            if col:
                if col == 'amsd_id':
                    fts_items.append(row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i))
                elif col not in no_fts_cols and not col.endswith('_pk'):
                    fts_items.append(row[col])

        for t in x_cols:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    fts_items.append(str(data[t][k]))
                    fts_items.extend(str(data[t][k]).split('_'))

        for t in ['ling_area_1', 'ling_area_2', 'ling_area_3']:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    fts_items.append(data['ling_area'][k].chirila_name)
                    fts_items.append(data['ling_area'][k].austlang_code)
                    fts_items.append(data['ling_area'][k].austlang_name)
                    fts_items.append(data['ling_area'][k].glottolog_code)

        if row['source_citation']:
            for k in row['source_citation'].split(';'):
                data.add(
                    common.ContributionReference,
                    k,
                    contribution_pk=int(row['pk']),
                    source_pk=int(k),
                )
                fts_items.append(str(data['Source'][k]))

        if row['linked_filenames']:
            for j, k in enumerate(row['linked_filenames'].split(';')):
                if k in fd:
                    oid = fd[k].get('oid')
                    mt = fd[k].get('mimetype')
                    refobjid = ''
                    if mt == 'application/pdf':
                        refobjid = oid
                        # for the web and thumbnail views, use a placeholder image
                        oid = 'EAEA0-52CC-0295-6B71-0'
                    n = fd[k].get('name')
                    data.add(
                        common.Contribution_files,
                        k,
                        id='%s-%s-%i' % (k, row['pk'], j),
                        object_pk=int(row['pk']),
                        name=n,
                        jsondata=dict(
                            original=fd[k].get('path'),
                            objid=oid,
                            refobjid=refobjid,
                            web='web.jpg',
                            thumbnail='thumbnail.jpg',
                        ),
                        ord=j,
                        mime_type=mt,
                    )
                    fts_items.append(n)
                    fts_items.extend(nfilter(re.split(r'[_\-\.]', n)))

        data.add(
            models.MessageStick,
            row['pk'],
            id=row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i),
            title=row['title'],
            description=row['description'],
            obj_creator=row['obj_creator'],
            date_created=row['date_created'],
            note_place_created=row['note_place_created'],
            place_created=row['place_created'],
            item_type_pk=row['item_type'] or None,
            item_subtype_pk=row['item_subtype'] or None,
            cultural_region_pk=row['cultural_region'] or None,
            ling_area_1_pk=row['ling_area_1'] or None,
            ling_area_2_pk=row['ling_area_2'] or None,
            ling_area_3_pk=row['ling_area_3'] or None,
            notes_ling_area=row['notes_ling_area'],
            stick_term=row['stick_term'],
            message=row['message'],
            motifs=row['motifs'],
            motif_transcription=row['motif_transcription'],
            dim_1=row['dim_1'],
            dim_2=row['dim_2'],
            dim_3=row['dim_3'],
            date_collected=row['date_collected'],
            holder_file_pk=row['holder_file'] or None,
            holder_obj_id=row['holder_obj_id'],
            collector=row['collector'],
            place_collected=row['place_collected'],
            creator_copyright=row['creator_copyright'],
            file_copyright=row['file_copyright'],
            latitude=row['lat'] or None,
            longitude=row['long'] or None,
            notes_coords=row['notes_coords'],
            url_institution=row['url_institution'],
            url_source_1=row['url_source_1'],
            url_source_2=row['url_source_2'],
            irn=row['irn'],
            notes=row['notes'],
            data_entry=row['data_entry'],
            fts=fts.tsvector('\n'.join(re.sub(r'[_\-]', '.', v) for v in fts_items if v)),
        )

    DBSession.flush()
    for row in dicts('sticks'):
        for t in ['sem_domain', 'material', 'source_type', 'technique', 'keywords']:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    data.add(
                        getattr(models, 'x_%s' % (t)),
                        k,
                        object_pk=int(row['pk']),
                        item_pk=int(k),
                    )
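
The search column in Example #13 is built by flattening every displayable field into one string; the normalization step, distilled (fts.tsvector itself is clld's full-text helper, not shown here):

import re

fts_items = ['amsd_00001', 'message stick', None, 'bark_etched']
document = '\n'.join(re.sub(r'[_\-]', '.', v) for v in fts_items if v)
assert document == 'amsd.00001\nmessage stick\nbark.etched'  # None entries dropped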
Example #14
def main(args, repos=None):
    glottolog = get_glottolog_api(repos)
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")
    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="{0} {1}".format(glottolog.publication.web.name, version),
        publisher_name=glottolog.publication.publisher.name,
        publisher_place=glottolog.publication.publisher.place,
        publisher_url=glottolog.publication.publisher.url,
        license=glottolog.publication.license.url,
        domain=purl.URL(glottolog.publication.web.url).domain(),
        contact=glottolog.publication.web.contact,
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': glottolog.publication.license.name},
    )
    data = Data()

    for e in glottolog.current_editors:
        ed = data.add(common.Contributor, e.id, id=e.id, name=e.name)
        common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord))
    DBSession.add(dataset)

    contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog')
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['hammarstroem']))

    #
    # Add Parameters:
    #
    add = functools.partial(add_parameter, data)
    add('fc', name='Family classification')
    add('sc', name='Subclassification')
    add('aes',
        glottolog.aes_status.values(),
        name=glottolog.aes_status.__defaults__['name'],
        pkw=dict(
            jsondata=dict(
                reference_id=glottolog.aes_status.__defaults__['reference_id'],
                sources=[attr.asdict(v) for v in glottolog.aes_sources.values()],
                scale=[attr.asdict(v) for v in glottolog.aes_status.values()])),
        dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)),
    )
    add('med',
        glottolog.med_types.values(),
        name='Most Extensive Description',
        dekw=lambda de: dict(
            name=de.name, description=de.description, number=de.rank, jsondata=dict(icon=de.icon)),
    )
    add('macroarea',
        glottolog.macroareas.values(),
        pkw=dict(
            description=glottolog.macroareas.__defaults__['description'],
            jsondata=dict(reference_id=glottolog.macroareas.__defaults__['reference_id'])),
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            jsondata=dict(geojson=read_macroarea_geojson(glottolog, de.name, de.description)),
        ),
    )
    add('ltype',
        glottolog.language_types.values(),
        name='Language Type',
        dekw=lambda de: dict(name=de.category, description=de.description),
        delookup='category',
    )
    add('country',
        glottolog.countries,
        dekw=lambda de: dict(name=de.id, description=de.name),
    )

    legacy = jsonlib.load(gc2version())
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    #
    # Now load languoid data, keeping track of relations that can only be inserted later.
    #
    lgsources = defaultdict(list)
    # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that top-level
    # nodes will precede nested nodes. This order must be preserved using an `OrderedDict`:
    nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()])
    lgcodes = {k: v.id for k, v in glottolog.languoids_by_code(nodemap).items()}
    for lang in tqdm(list(nodemap.values())):
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(glottolog, data, lang, nodemap)

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider,
            bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            mas = []
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma)
                mas.append(ma.name)
            ref.macroareas = ', '.join(mas)
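
Example #14 normalizes legacy macroarea spellings before the Glottolog lookup; as a standalone helper (the function name is mine, the two mappings come from the code above):

def normalize_macroarea(name):
    name = 'North America' if name == 'Middle America' else name
    return 'Papunesia' if name == 'Papua' else name

assert normalize_macroarea('Papua') == 'Papunesia'
assert normalize_macroarea('Eurasia') == 'Eurasia'  # everything else passes through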
Example #15
def main(args):
    for (org, repos), recs in itertools.groupby(
            sorted(oai.Records('tular'),
                   key=lambda r: (r.repos.org, r.repos.repos, r.version),
                   reverse=True),
            lambda r: (r.repos.org, r.repos.repos),
    ):
        if org == 'tupian-language-resources' and repos in DATASETS:
            DATASETS[repos] = next(recs)

    data = Data()
    dataset = data.add(
        common.Dataset,
        'tular',
        id=tular.__name__,
        domain="tular.clld.org",
        name="TuLaR",
        description="Tupían Language Resources",
        publisher_name="Max-Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        contact="*****@*****.**",
        jsondata={
            'license_icon':
            'cc-by-sa.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 4.0 International License'
        },
    )

    rd = pathlib.Path(tular.__file__).parent.parent.parent.resolve()
    root = input('Project dir [{}]: '.format(str(rd)))
    root = pathlib.Path(root) if root else rd
    clts = clts_from_input(rd / '..' / 'cldf-clts' / 'clts-data')

    for db, rec in DATASETS.items():
        print(db, rec.doi, rec.tag)
        dbdir = root.joinpath(db)
        assert dbdir.exists()
        md = jsonlib.load(dbdir / 'metadata.json')
        name = md['title']
        if md['description']:
            name += ': {}'.format(md['description'])
        contribution = data.add(
            Database,
            db,
            id=db,
            name=name,
            description=rec.citation if rec else None,
            doi=rec.doi if rec else None,
        )
        header, contribs = next(
            iter_markdown_tables(
                dbdir.joinpath('CONTRIBUTORS.md').read_text(encoding='utf8')))
        for i, contrib in enumerate(contribs):
            contrib = dict(zip(header, contrib))
            cid = slug(HumanName(contrib['Name']).last)
            contributor = data['Contributor'].get(cid)
            if not contributor:
                contributor = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=contrib['Name'],
                    description=contrib.get('Affiliation'),
                )
            DBSession.add(
                common.ContributionContributor(
                    contribution=contribution,
                    contributor=contributor,
                    primary='author' in contrib['Role'].lower(),
                    ord=i,
                ))

    for i, cid in enumerate(
        ['gerardi', 'reichert', 'aragon', 'list', 'forkel']):
        DBSession.add(
            common.Editor(contributor=data['Contributor'][cid],
                          dataset=dataset,
                          ord=i))

    source_ids = list(add_sources(args.cldf.bibpath, DBSession))
    sources = {s.id: s.pk for s in DBSession.query(common.Source)}
    subgroups = []

    for row in args.cldf['LanguageTable']:
        if row['SubGroup'] not in subgroups:
            subgroups.append(row['SubGroup'])
        family = data['Family'].get(row['Family'])
        if (not family) and row['Family']:
            family = data.add(Family,
                              row['Family'],
                              id=slug(row['Family']),
                              name=row['Family'])
        data.add(
            Doculect,
            row['ID'],
            id=row['ID'],
            name=row['Name'].replace('_', ' '),
            family=family,
            subfamily=row['SubGroup'],
            iso_code=row['ISO639P3code'],
            glotto_code=row['Glottocode'],
            longitude=row['Longitude'],
            latitude=row['Latitude'],
            jsondata=dict(icon=SUBGROUPS[row['SubGroup']]),
        )

    tudet = Dataset.from_metadata(root / 'tudet' / 'cldf' /
                                  'Generic-metadata.json')
    seen = set()
    for row in tudet['ExampleTable']:
        if row['ID'] in seen:
            print('skipping duplicate sentence ID {}'.format(row['ID']))
            continue
        seen.add(row['ID'])
        DBSession.add(
            Example(id=row['ID'],
                    name=row['Primary_Text'],
                    description=row['Translated_Text'],
                    language=data['Doculect'][row['Language_ID']],
                    conllu=row['conllu']))

    contrib = data['Database']['tuled']
    for row in args.cldf['ParameterTable']:
        data.add(
            Concept,
            row['ID'],
            id=row['ID'].split('_')[0],
            name=row['Name'],
            portuguese=row['Portuguese_Gloss'],
            semantic_field=row['Semantic_Field'],
            concepticon_class=row['Concepticon_ID'],
            eol=row['EOL_ID'],
        )
    for (lid, pid), rows in itertools.groupby(
            sorted(args.cldf.iter_rows('FormTable', 'languageReference',
                                       'parameterReference'),
                   key=lambda r: (r['Language_ID'], r['Parameter_ID'])),
            lambda r: (r['Language_ID'], r['Parameter_ID']),
    ):
        vsid = '{}-{}'.format(lid, pid)
        vs = data.add(
            common.ValueSet,
            vsid,
            id=vsid,
            language=data['Doculect'][lid],
            parameter=data['Concept'][pid],
            contribution=contrib,
        )
        refs = set()
        for row in rows:
            data.add(
                Word,
                row['ID'],
                id=row['ID'],
                valueset=vs,
                name=row['Form'],
                tokens=' '.join(row['Segments']),
                simple_cognate=int(row['SimpleCognate']),
                notes=row['Comment'],
                morphemes=' '.join(row['Morphemes']),
                partial_cognate=' '.join(row['PartialCognates'])
                if row['PartialCognates'] else None,
            )
            refs = refs.union(row['Source'])

        for ref in refs:
            if ref in source_ids:
                DBSession.add(
                    common.ValueSetReference(valueset=vs,
                                             source_pk=sources[slug(
                                                 ref, lowercase=False)]))

    load_inventories(args.cldf, clts, data['Doculect'])

    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(
                Cognateset,
                row['Cognateset_ID'],
                id=row['Cognateset_ID'],
                name=row['Cognateset_ID'],
                contribution=contrib,
            )
        data.add(
            Cognate,
            row['ID'],
            cognateset=cc,
            counterpart=data['Word'][row['Form_ID']],
            alignment=' '.join(row['Alignment'] or []),
        )
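
The one-ValueSet-per-(language, parameter) grouping above only works because the forms are sorted on that same key first; itertools.groupby merges adjacent runs only:

import itertools

rows = [('lang1', 'hand'), ('lang2', 'hand'), ('lang1', 'foot')]
keys = [k for k, _ in itertools.groupby(rows, key=lambda r: r[0])]
assert keys == ['lang1', 'lang2', 'lang1']            # unsorted: lang1 splits in two
keys = [k for k, _ in itertools.groupby(sorted(rows), key=lambda r: r[0])]
assert keys == ['lang1', 'lang2']                     # sorted: one group per language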
Example #16
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cobl2.__name__,
        name="IE-CoR",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='iecor.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    editors = OrderedDict([('Heggarty', None), ('Anderson', None), ('Scarborough', None)])
    for row in sorted(ds['authors.csv'], key=lambda x: [
            x['Last_Name'].lower(), x['First_Name'].lower()]):
        if row['Last_Name'] in editors:
            editors[row['Last_Name']] = row['ID']
        data.add(
            models.Author,
            row['ID'],
            id=row['ID'],
            name='{0} {1}'.format(row['First_Name'], row['Last_Name']),
            url=row['URL'],
            photo=data_uri(photos[row['Last_Name']], 'image/jpg') if row['Last_Name'] in photos else None)

    for i, cid in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=data['Author'][cid], ord=i + 1)

    for src in ds.sources.items():
        for invalid in ['isbn', 'part', 'institution']:
            if invalid in src:
                del src[invalid]
        data.add(
            common.Source,
            src.id,
            id=src.id,
            name=src.get('author', src.get('editor')),
            description=src.get('title', src.get('booktitle')),
            bibtex_type=getattr(EntryType, src.genre, EntryType.misc),
            **src)

    re_links = re.compile(r'\[(?P<label>[^\]]+?)\]\((?P<type>.+?)-(?P<id>\d+)\)')
    link_map = {
        'cog': '/cognatesets/',
        'lex': '/values/',
        'src': '/sources/',
    }

    def parse_links(m):
        try:
            return '<a href="{}{}">{}</a>'.format(
                link_map[m.group('type')], m.group('id'), m.group('label'))
        except KeyError:
            print("parse_links: type error in '{}'".format(":".join(m.groups())))
            return '[{}]({}-{})'.format(m.group('label'), m.group('type'), m.group('id'))

    for param in ds['ParameterTable']:
        data.add(
            models.Meaning,
            param['ID'],
            id=slug(param['Name']),
            name=param['Name'],
            description_md=param['Description_md'],
            concepticon_id=int(param['Concepticon_ID']) if param['Concepticon_ID'] != '0' else None,
        )

    for row in ds['clades.csv']:
        data.add(
            models.Clade,
            row['ID'],
            id=row['ID'],
            level0_name=row['level0_name'],
            level1_name=row['level1_name'],
            level2_name=row['level2_name'],
            level3_name=row['level3_name'],
            clade_level0=row['clade_level0'],
            clade_level1=row['clade_level1'],
            clade_level2=row['clade_level2'],
            clade_level3=row['clade_level3'],
            clade_name=row['clade_name'],
            short_name=row['short_name'],
            color=row['color'],
        )

    for row in ds['LanguageTable']:
        c = data.add(
            common.Contribution,
            row['ID'],
            id=row['ID'],
            name=row['Name'],
        )
        for i, cid in enumerate(row['Author_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c, contributor=data['Author'][cid], ord=i + 1))
        data.add(
            models.Variety,
            row['ID'],
            id=slug(row['Name']),
            name=row['Name'],
            latitude=float(row['Latitude']) if row['Latitude'] is not None else None,
            longitude=float(row['Longitude']) if row['Longitude'] is not None else None,
            contribution=c,
            color=rgb_as_hex(row['Color']),
            clade=', '.join(filter(None, row['Clade'])),
            clade_name=row['clade_name'],
            glottocode=row['Glottocode'],
            historical=row['historical'],
            distribution=row['distribution'],
            logNormalMean=row['logNormalMean'],
            logNormalOffset=row['logNormalOffset'],
            logNormalStDev=row['logNormalStDev'],
            normalMean=row['normalMean'],
            normalStDev=row['normalStDev'],
            ascii_name=row['ascii_name'],
            iso=row['ISO639P3code'],
            lang_description=row['Description'],
            variety=row['Variety'],
            loc_justification=row['loc_justification'] or None,
            sort_order=row['sort_order']
        )

    vsrs = set()
    for row in ds['FormTable']:
        vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID']))
        if not vs:
            vs = data.add(
                common.ValueSet,
                (row['Language_ID'], row['Parameter_ID']),
                id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']),
                language=data['Variety'][row['Language_ID']],
                parameter=data['Meaning'][row['Parameter_ID']],
                contribution=data['Contribution'][row['Language_ID']],
            )
        v = data.add(
            models.Lexeme,
            row['ID'],
            id=row['ID'],
            name=row['Form'],
            native_script=row['native_script'],
            phonetic=row['phon_form'],
            phonemic=row['Phonemic'],
            comment=re_links.sub(parse_links, row['Comment'] or ''),
            url=row['url'],
            gloss=row['Gloss'],
            valueset=vs
        )
        for src in row['Source']:
            sid, pages = ds.sources.parse(src)
            key = (vs.id, sid, pages)
            if pages:
                pages = pages.replace('|', ';')
            if key not in vsrs:
                DBSession.add(common.ValueSetReference(
                    valueset=vs, source=data['Source'][sid], description=pages))
                vsrs.add(key)

    for row in ds['CognatesetTable']:
        cc = data.add(
            models.CognateClass,
            row['ID'],
            id=row['ID'],
            name=row['ID'],
            root_form=row['Root_Form_calc'] if row['Root_Form_calc'] is not None and len(row['Root_Form_calc']) else row['Root_Form'],
            root_form_calc=row['Root_Form_calc'] or None,
            root_gloss=row['Root_Gloss'] or None,
            root_language=row['Root_Language_calc'] if row['Root_Language_calc'] is not None and len(row['Root_Language_calc']) else row['Root_Language'],
            root_language_calc=row['Root_Language_calc'] or None,
            comment=re_links.sub(parse_links, row['Comment'] or ''),
            justification=re_links.sub(parse_links, row['Justification'] or ''),
            ideophonic=row['Ideophonic'] or None,
            parallel_derivation=row['parallelDerivation'] or None,
            revised_by=','.join(row['revised_by']) or None,
            superset_id=int(row['supersetid']) if row['supersetid'] else None,
        )
        for src in row['Source']:
            sid, pages = ds.sources.parse(src)
            if pages:
                pages = pages.replace('|', ';')
            DBSession.add(clld_cognacy_plugin.models.CognatesetReference(
                cognateset=cc, source=data['Source'][sid], description=pages))

    DBSession.flush()

    cc_id_pk_map = {str(ccid): cc.pk for ccid, cc in data['CognateClass'].items()}
    for row in ds['CognatesetTable']:
        if row['proposedAsCognateTo_pk']:
            DBSession.add(models.ProposedCognates(
                cc1_pk=data['CognateClass'][row['ID']].pk,
                cc2_pk=cc_id_pk_map[str(row['proposedAsCognateTo_pk'])],
                scale=row['proposedAsCognateToScale']
            ))
    DBSession.flush()

    loans = {ln['Cognateset_ID']: ln for ln in ds['loans.csv']}
    for ccid, cc in data['CognateClass'].items():
        if ccid in loans:
            le = loans[ccid]
            if le['SourceCognateset_ID']:
                cc.loan_source_pk = data['CognateClass'][le['SourceCognateset_ID']].pk
            else:
                cc.loan_source_pk = None
            cc.loan_notes = le['Comment']
            cc.loan_source_languoid = le['Source_languoid']
            cc.loan_source_form = le['Source_form']
            cc.parallel_loan_event = le['Parallel_loan_event']
            cc.is_loan = True

    for row in ds['CognateTable']:
        cc = data['CognateClass'][row['Cognateset_ID']]
        if cc.meaning_pk is None:
            cc.meaning_pk = data['Lexeme'][row['Form_ID']].valueset.parameter_pk
        else:
            assert data['Lexeme'][row['Form_ID']].valueset.parameter_pk == cc.meaning_pk
        data.add(
            clld_cognacy_plugin.models.Cognate,
            row['ID'],
            cognateset=data['CognateClass'][row['Cognateset_ID']],
            counterpart=data['Lexeme'][row['Form_ID']],
            doubt=row['Doubt'],
        )

    l_by_gc = {}
    for s in DBSession.query(models.Variety):
        l_by_gc[s.glottocode] = s.pk

    tree = Phylogeny(
        id='1',
        name='Bouckaert et al.',
        description='',
        newick=Path.read_text(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'newick.txt'),
    )
    for k, taxon in enumerate(reader(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'taxa.csv', namedtuples=True)):
        label = TreeLabel(
            id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1),
            name=taxon.taxon,
            phylogeny=tree,
            description=taxon.glottocode)
        if taxon.glottocode in l_by_gc:
            LanguageTreeLabel(language_pk=l_by_gc[taxon.glottocode], treelabel=label)
    DBSession.add(tree)

    l_by_ascii = {}
    for s in DBSession.query(models.Variety):
        l_by_ascii[s.ascii_name] = s.pk

    tree = Phylogeny(
        id='2',
        name='CoBL consensus',
        description='',
        newick=Path.read_text(data_file_path / 'raw' / 'ie122' / 'newick.txt'),
    )
    for k, taxon in enumerate(reader(data_file_path / 'raw' / 'ie122' / 'taxa.csv', namedtuples=True)):
        label = TreeLabel(
            id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1),
            name=taxon.taxon,
            phylogeny=tree)
        if taxon.taxon in l_by_ascii:
            LanguageTreeLabel(language_pk=l_by_ascii[taxon.taxon], treelabel=label)
    DBSession.add(tree)
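
The cross-reference markup handled by re_links in Example #16 is clearer on a sample comment (omitting the KeyError fallback):

import re

re_links = re.compile(r'\[(?P<label>[^\]]+?)\]\((?P<type>.+?)-(?P<id>\d+)\)')
link_map = {'cog': '/cognatesets/', 'lex': '/values/', 'src': '/sources/'}
html = re_links.sub(
    lambda m: '<a href="{}{}">{}</a>'.format(
        link_map[m.group('type')], m.group('id'), m.group('label')),
    'cf. [walhaz](cog-123)')
assert html == 'cf. <a href="/cognatesets/123">walhaz</a>'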
Example #17
def main(args):
    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    data.add(common.Contribution,
             'tsammalex',
             name="Tsammalex",
             id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec,
                 rec.id,
                 _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(data,
                           lang,
                           lang.id.split('-')[0],
                           None,
                           glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg',
                      source_url).replace('/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(url=image.source_url,
                            thumbnail=image_url(image.source_url, 'thumbnail'),
                            web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(object=data['Taxon'][image.taxa__id],
                                       id=image.id,
                                       name=image.tags,
                                       jsondata=jsondata,
                                       mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
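
The derived-image URLs in Example #17 come from a fixed path layout on the media server; image_url applied to a made-up source URL:

import re

def image_url(source_url, type_):
    return re.sub(r'\.[a-zA-Z]+$', '.jpg',
                  source_url).replace('/original/', '/%s/' % type_)

src = 'https://edmond.mpdl.mpg.de/imeji/original/lion.png'
assert image_url(src, 'web') == 'https://edmond.mpdl.mpg.de/imeji/web/lion.jpg'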
Example #18
def main(args):

    Index('ducet', collkey(func.translate(common.Value.name, 'ˈ,ː,ˌ', '')))\
        .create(DBSession.bind)

    data = Data()

    dataset = common.Dataset(
        id=numerals.__name__,
        name="Numeralbank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain="numerals.clld.org",
        jsondata={
            "license_icon": "cc-by.png",
            "license_name": "Creative Commons Attribution 4.0 International License",
        },
    )

    DBSession.add(dataset)

    for i, (id_, name) in enumerate(
        [("verkerkannemarie", "Annemarie Verkerk"), ("rzymskichristoph", "Christoph Rzymski")]
    ):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    DBSession.add(dataset)

    # Take metadata from the curated CLDF dataset
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')
    # Parameters:
    for parameter in ds["ParameterTable"]:
        data.add(
            models.NumberParameter,
            parameter["ID"],
            id=parameter["ID"],
            name="{0}".format(parameter["ID"]),
            concepticon_id=parameter['Concepticon_ID'],
        )
    basis_parameter = data.add(
        models.NumberParameter,
        "0",
        id="0",
        name="Base",
    )
    load_family_langs = []
    for language in ds["LanguageTable"]:
        lang = data.add(
            models.Variety,
            language["ID"],
            id=language["ID"],
            name=language["Name"],
            latitude=language["Latitude"],
            longitude=language["Longitude"],
            creator=language["Contributor"],
            comment=language["Comment"],
            url_soure_name=language["SourceFile"],
        )
        if language["Glottocode"]:
            load_family_langs.append((language["Glottocode"], lang))

    # get original forms
    ds = Wordlist.from_metadata(data_repos[0]['data_path'] / 'cldf' / 'cldf-metadata.json')
    org_forms = {f["ID"]: f for f in ds["FormTable"]}

    d = data_repos[1]
    contrib = data.add(
        common.Contribution,
        d['id'],
        id=d['id'],
        name=d['name']
    )

    # process curated forms
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Add Base info if given
    for language in ds["LanguageTable"]:
        if language["Base"]:
            basis = language["Base"]
            de = data["DomainElement"].get(basis)
            if not de:
                de = data.add(
                    common.DomainElement,
                    basis,
                    id=text_type(basis),
                    name=text_type(basis),
                    parameter=basis_parameter,
                )
            vs = data.add(
                common.ValueSet,
                data["Variety"][language["ID"]].id,
                id=data["Variety"][language["ID"]].id,
                language=data["Variety"][language["ID"]],
                parameter=basis_parameter,
                contribution=contrib,
            )

            common.Value(
                id=data["Variety"][language["ID"]].id,
                valueset=vs,
                domainelement=de
            )

    # Forms:
    for form in ds["FormTable"]:
        valueset_id = "{0}-{1}".format(form["Parameter_ID"], form["Language_ID"])
        vs = data["ValueSet"].get(valueset_id)

        # Create the ValueSet unless we already have it; skip forms whose
        # language is not in the LanguageTable:
        if not vs:
            if form["Language_ID"] not in data["Variety"]:
                continue
            vs = data.add(
                common.ValueSet,
                valueset_id,
                id=valueset_id,
                language=data["Variety"][form["Language_ID"]],
                parameter=data["NumberParameter"][form["Parameter_ID"]],
                contribution=contrib,
            )

        org_form = ""
        if form["ID"] in org_forms:
            if unicodedata.normalize('NFC', org_forms[form["ID"]]["Form"].strip()) != form["Form"]:
                org_form = org_forms[form["ID"]]["Form"]
        else:
            org_form = "no original form"
        DBSession.add(
            models.NumberLexeme(
                id=form["ID"],
                name=form["Form"],
                comment=form["Comment"],
                is_loan=form["Loan"],
                other_form=form["Other_Form"],
                org_form=org_form,
                is_problematic=form["Problematic"],
                valueset=vs,
            )
        )

    load_families(
        Data(),
        load_family_langs,
        glottolog_repos=gl_repos,
        strict=False,
    )

    distinct_varieties = DBSession.query(models.Variety.family_pk).distinct().all()
    families = dict(
        zip([r[0] for r in distinct_varieties], color.qualitative_colors(len(distinct_varieties)))
    )

    for l in DBSession.query(models.Variety):
        l.jsondata = {"color": families[l.family_pk]}

    p = common.Parameter.get("0")
    colors = color.qualitative_colors(len(p.domain))

    for i, de in enumerate(p.domain):
        de.jsondata = {"color": colors[i]}
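
The family coloring at the end pairs each distinct family_pk with one entry of
a qualitative palette. The same pairing in isolation, with a stand-in for
clld's color.qualitative_colors (assumed to return n distinct color strings):

def qualitative_palette(n):
    # stand-in palette; the real one returns n visually distinct colors
    return ['color-{0}'.format(i) for i in range(n)]

family_pks = [1, 2, 2, 3]
distinct = sorted(set(family_pks))
families = dict(zip(distinct, qualitative_palette(len(distinct))))
assert families == {1: 'color-0', 2: 'color-1', 3: 'color-2'}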
Example #19
def load_ecoregions(filter=None):
    """

    :param data:
    :param filter:
    :return:
    """
    ecoregions = jsonlib.load(
        pathlib.Path(pytsammalex.__file__).parent /
        'ecoregions.json')['features']

    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    data = Data()
    for eco_code, features in itertools.groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if filter and not filter(eco_code, props):
            continue

        if int(props['BIOME']) not in biome_map:
            continue

        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(Biome,
                             props['BIOME'],
                             id=str(int(props['BIOME'])),
                             name=name,
                             description=color or 'ffffff')
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)

        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(Ecoregion,
                 eco_code,
                 id=eco_code,
                 name=props['ECO_NAME'],
                 description=props['G200_REGIO'],
                 latitude=centroid[1],
                 longitude=centroid[0],
                 biome=biome,
                 area=props['area_km2'],
                 gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
                 realm=Ecoregion.realm_map[props['REALM']],
                 jsondata=dict(polygons=polygons))
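
Note that itertools.groupby only merges adjacent items, which is why the
features are sorted by eco_code before grouping. The requirement in isolation:

import itertools

items = [{'eco_code': 'B'}, {'eco_code': 'A'}, {'eco_code': 'A'}]
by_code = lambda e: e['eco_code']
grouped = {k: len(list(g))
           for k, g in itertools.groupby(sorted(items, key=by_code), key=by_code)}
assert grouped == {'A': 2, 'B': 1}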
Example #20
def main(args):  # pragma: no cover
    data = Data()
    clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data'
    clts_repos = CLTS(clts_repos)
    print(clts_repos.repos)
    version = 'v2.1.0' # assert_release(clts_repos.repos)

    for rec in Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id='clts',
        name="CLTS {0}".format(version),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate([
        'Johann-Mattis List',
        'Cormac Anderson',
        'Tiago Tresoldi',
        'Robert Forkel',
    ]):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for line in args.cldf['data/features.tsv']:
        data.add(
            models.Feature,
            line['ID'],
            id=line['ID'],
            name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']),
            sound_type=line['TYPE'],
            feature=line['FEATURE'],
            value=line['VALUE'],
        )

    DBSession.add(models.SoundSegment(
        id='NA',
        name='<NA>',
        description='<NA>',
        type='marker',
        generated=True,
        unicode='',
        color='#bbbbbb',
    ))
    for line in args.cldf['data/sounds.tsv']:
        s = data.add(
            models.SoundSegment,
            line['ID'],
            id=line['ID'],
            name=line['GRAPHEME'],
            description=line['NAME'],
            type=line['TYPE'],
            generated=line['GENERATED'],
            unicode=' / '.join(line['UNICODE']),
            color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']),
        )
        if s.color == '0':
            s.color = '#bbbbbb'
        assert s.color in LEGEND
    DBSession.flush()

    seen = set()
    for line in args.cldf['data/sounds.tsv']:
        for fid in line['FEATURES']:
            spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk
            if (spk, fpk) not in seen:
                DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk))
                seen.add((spk, fpk))

    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')

    for line in args.cldf['sources/index.tsv']:
        c = data.add(
            models.Transcription,
            line['NAME'],
            id=line['NAME'],
            name=line['NAME'],
            description=line['DESCRIPTION'].replace(':bib:', '/sources/'),
            datatype=getattr(models.Datatype, line['TYPE'])
        )
        for ref in line.get('REFS', []):
            common.ContributionReference(source=data['Source'][ref], contribution=c)

    sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl
    image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl

    for line in args.cldf['data/graphemes.tsv']:
        key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME']
        if key not in data['Grapheme']:
            sound_id = line['NAME'].replace(' ', '_')
            vs = data['ValueSet'].get((line['DATASET'], line['NAME']))
            if not vs:
                try:
                    vs = data.add(
                        common.ValueSet,
                        (line['DATASET'], line['NAME']),
                        id=key,
                        description=line['NAME'],
                        language=english,
                        contribution=data['Transcription'][line['DATASET']],
                        parameter=data['SoundSegment'][sound_id]
                    )
                except:
                    print(line)
                    raise
            data.add(
                models.Grapheme,
                key,
                id=key,
                name=line['GRAPHEME'],
                description=line['NAME'],
                url=line['URL'].unsplit() if line['URL'] else None,
                audio=sound_url_template.expand(line) if line['SOUND'] else None,
                image=image_url_template.expand(line) if line['IMAGE'] else None,
                valueset=vs
            )
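
The seen set above guards the SoundSegmentFeature association table against
duplicate (sound, feature) rows when a sound lists the same feature twice.
The pattern in isolation:

pairs = [('s1', 'f1'), ('s1', 'f1'), ('s2', 'f1')]
seen = set()
unique = []
for pair in pairs:
    if pair not in seen:
        unique.append(pair)
        seen.add(pair)
assert unique == [('s1', 'f1'), ('s2', 'f1')]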
Example #21
def main(args):
    data = Data()
    print(args.data_file('x'))

    dataset = common.Dataset(
        id=grammaticon.__name__,
        name="Grammaticon",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grammaticon.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, ed in enumerate(['Martin Haspelmath', 'Robert Forkel']):
        common.Editor(dataset=dataset, contributor=get_contributor(data, ed), ord=i + 1)

    eng = data.add(common.Language, 'eng', name='English')

    for obj in reader(args.data_file('Feature_lists.csv'), dicts=True):
        contrib = data.add(
            models.Featurelist, obj['id'],
            id=slug(obj['name']),
            name=obj['name'],
            year=obj['year'],
            number_of_features=int(obj['number of features']) if obj['number of features'] else None,
            url=obj['url'])
        if obj['authors']:
            for i, author in enumerate(obj['authors'].split(',')):
                common.ContributionContributor(
                    contribution=contrib,
                    contributor=get_contributor(data, author),
                    ord=i + 1)

    #id,name,feature_area
    for name, objs in itertools.groupby(
            sorted(reader(args.data_file('Metafeatures.csv'), dicts=True), key=lambda i: i['name']),
            lambda i: i['name']):
        dbobj = None
        for obj in objs:
            if not dbobj:
                dbobj = data.add(
                    models.Metafeature, obj['id'],
                    id=slug(obj['id']), name=obj['name'], area=obj['feature_area'])
            else:
                data['Metafeature'][obj['id']] = dbobj

    DBSession.flush()
    #feature_ID,feature name,feature description,meta_feature_id,collection_id,collection URL,collection numbers
    for obj in reader(args.data_file('Features.csv'), dicts=True):
        if int(obj['collection_id']) == 8:
            obj['collection_id'] = '1'
        if not obj['meta_feature_id']:  # or obj['meta_feature_id'] in ('89'):
            print('skipping: {}'.format(obj))
            continue
        vsid = (data['Featurelist'][obj['collection_id']].pk, data['Metafeature'][obj['meta_feature_id']].pk)
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id='{0}-{1}'.format(*vsid),
                contribution=data['Featurelist'][obj['collection_id']],
                parameter=data['Metafeature'][obj['meta_feature_id']],
                language=eng)
        models.Feature(
            valueset=vs, id=slug(obj['feature_ID']), name=obj['feature name'], description=obj['feature description'])

    for obj in reader(args.data_file('Concepts.csv'), dicts=True):
        data.add(
            models.Concept, obj['id'],
            id=obj.pop('id'), name=obj.pop('label'), description=obj.pop('definition'),
            **{k.replace(' ', '_'): v for k, v in obj.items()})

    for obj in reader(args.data_file('Concepts_metafeatures.csv'), dicts=True):
        if obj['meta_feature__id'] in ('89',):
            print('skipping: {}'.format(obj))
            continue
        if obj['concept_id'] and obj['meta_feature__id']:
            models.ConceptMetafeature(
                concept=data['Concept'][obj['concept_id']],
                metafeature=data['Metafeature'][obj['meta_feature__id']])

    for obj in reader(args.data_file('Concepthierarchy.csv'), dicts=True):
        child = data['Concept'].get(obj['concept_id'])
        if child:
            parent = data['Concept'].get(obj['concept_parent_id'])
            if parent:
                DBSession.add(models.ConceptRelation(parent=parent, child=child))
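
The Metafeatures loop above collapses CSV rows sharing a name into a single
object while still registering every row id as a lookup key, so a reference
via any duplicate id resolves to the same record. The aliasing trick with
plain dicts (hypothetical ids):

index = {}
rows = [{'id': 'a1', 'name': 'case'}, {'id': 'a2', 'name': 'case'}]
canonical = None
for row in rows:
    if canonical is None:
        canonical = {'id': row['id'], 'name': row['name']}
    index[row['id']] = canonical
assert index['a1'] is index['a2']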
Example #22
def main(args):

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        jambu.__name__,
        id=jambu.__name__,
        name='Jambu',
        domain='jambu-clld.herokuapp.com',
        publisher_name="Georgetown University",
        publisher_place="Washington",
        publisher_url="http://gucl.georgetown.edu/",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    for i, name in enumerate(['Aryaman Arora']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(
                id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    print("Languages...")
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name',
                          'glottocode', 'longitude', 'latitude', 'Clade'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            family=lang['Clade'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    print("Cognates...")
    for cognate in iteritems(args.cldf, 'CognateTable'):
        # print(cognate)
        data.add(models.Cognate_,
                 cognate['Cognateset_ID'],
                 name=cognate['Form'],
                 language=cognate['Language_ID'],
                 description=cognate['Description'])

    counts = collections.defaultdict(set)
    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        counts[form['parameterReference']].add(form['languageReference'])

    print("Params...")
    for param in tqdm(
            iteritems(args.cldf, 'ParameterTable', 'ID', 'Name',
                      'Concepticon_ID', 'Description')):
        data.add(models.Concept,
                 param['ID'],
                 id=param['ID'],
                 name='{} [{}]'.format(param['Name'], param['ID']),
                 description=param['Description'],
                 count=len(counts[param['ID']]))

    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        l = re.split(r";|\+", form['parameterReference'])
        for i, paramref in enumerate(l):
            if paramref == '?': continue
            vsid = (form['languageReference'], paramref)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet,
                    vsid,
                    id='-'.join(vsid),
                    language=data['Variety'][form['languageReference']],
                    parameter=data['Concept'][paramref],
                    contribution=contrib,
                )

            for ref in form.get('source', []):
                sid, pages = Sources.parse(ref)
                refs[(vsid, sid)].append(pages)

            data.add(
                models.Lexeme,
                form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                id=form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                name=form['form'],
                gloss=form['Gloss'],
                native=form['Native'],
                phonemic='/{}/'.format(form['Phonemic']) if form['Phonemic'] else None,
                description=form['Description'],
                cognateset=form['Cognateset'],
                valueset=vs,
            )

    print("Refs...")
    for (vsid, sid), pages in tqdm(refs.items()):
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
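
Forms glossed with several concepts are split on ';' or '+' and yield one
Lexeme per concept; '-<index>' is appended to the id only when a form was
actually split, and '?' entries are dropped. In isolation (hypothetical ids):

import re

form_id, param_ref = 'f1', 'water;?;liquid'
parts = re.split(r";|\+", param_ref)
lexeme_ids = [form_id + '-' + str(i) if len(parts) > 1 else form_id
              for i, p in enumerate(parts) if p != '?']
assert lexeme_ids == ['f1-0', 'f1-2']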
Example #23
def main(args):

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    data.add(
        common.Dataset,
        polyglottaafricana.__name__,
        id=polyglottaafricana.__name__,
        domain='',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode',
                          'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id',
                           'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
        )
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form',
                          'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            common.Value,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
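
Source references are buffered in a defaultdict keyed by (valueset id, source
id) and flushed once at the end, joining the collected page ranges; nfilter
drops empty entries. The same bookkeeping in isolation (hypothetical ids):

import collections

refs = collections.defaultdict(list)
for vsid, sid, pages in [('vs1', 'smith2001', '12-14'),
                         ('vs1', 'smith2001', None),
                         ('vs1', 'smith2001', '30')]:
    refs[(vsid, sid)].append(pages)
assert '; '.join(p for p in refs[('vs1', 'smith2001')] if p) == '12-14; 30'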
Example #24
def main(args):
    data = Data()
    ds = Pofatu(
        pathlib.Path(pofatu.__file__).parent.parent.parent / 'pofatu-data')

    dataset = common.Dataset(
        id=pofatu.__name__,
        name="POFATU",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='pofatu.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })

    for i, (id_, name) in enumerate([
        ('hermannaymeric', 'Aymeric Hermann'),
        ('forkelrobert', 'Robert Forkel'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for rec in ds.iterbib():
        rec.genre = bibtex.EntryType.from_string(
            ENTRY_TYPES.get(rec.genre, rec.genre))
        if 'date' in rec:
            rec['year'] = rec.pop('date')
        data.add(common.Source,
                 rec.id,
                 _obj=bibtex2source(rec, lowercase_id=False))

    analyses = list(ds.iterdata())

    def midpoint(coords):
        p = MultiPoint([(lat, lon + 360 if lon < 0 else lon)
                        for lat, lon in coords]).convex_hull
        #geojson = {
        #    'type': 'Feature',
        #    'properties': {},
        #    'geometry': mapping(p)}
        c = p.centroid
        return c.x, (c.y - 360) if c.y > 180 else c.y

    artefacts = collections.defaultdict(dict)
    midpoints = {}
    for a in analyses:
        l = a.sample.location
        lid = l.id
        if lid not in midpoints:
            midpoints[lid] = set()
        if l.latitude is not None and l.longitude is not None:
            midpoints[lid].add((l.latitude, l.longitude))
        art = a.sample.artefact
        for attr_ in ['name', 'category', 'collection_type']:
            if not artefacts[slug(art.id)].get(attr_):
                artefacts[slug(art.id)][attr_] = getattr(art, attr_)

    midpoints = {
        k: midpoint(v) if v else (None, None)
        for k, v in midpoints.items()
    }

    for analysis in analyses:
        loc = analysis.sample.location
        if loc.id not in data['Location']:
            data.add(
                models.Location,
                loc.id,
                id=valid_id(loc.id),
                name=loc.label,
                latitude=midpoints[loc.id][0],
                longitude=midpoints[loc.id][1],
                region=loc.region.replace('_', ' '),
                subregion=loc.subregion,
                location=loc.locality,
            )

    # Add contributions
    for contrib in ds.itercontributions():
        contribution = data.add(
            common.Contribution,
            contrib.id,
            id=valid_id(contrib.id),
            name=contrib.label,
            description=contrib.description,
        )
        DBSession.flush()
        for i, name in enumerate(contrib.contributors):
            cid = slug(name)
            co = data['Contributor'].get(cid)
            if not co:
                co = data.add(common.Contributor, cid, id=cid, name=name)
            common.ContributionContributor(ord=i,
                                           contribution=contribution,
                                           contributor=co)

        for ref in contrib.source_ids:
            DBSession.add(
                common.ContributionReference(
                    contribution=contribution,
                    source=data['Source'][ref],
                ))
            data['Contribution'][ref] = contribution

    methods = collections.defaultdict(list)
    for method in ds.itermethods():
        m = data.add(
            models.Method,
            method.id,
            id=valid_id(method.id),
            name=method.label,
            code=method.code,
            parameter=method.parameter.strip(),
            instrument=method.instrument,
            number_of_replicates=method.number_of_replicates,
            date=method.date,
            comment=method.comment,
            detection_limit=method.detection_limit,
            detection_limit_unit=method.detection_limit_unit,
            total_procedural_blank_value=method.total_procedural_blank_value,
            total_procedural_unit=method.total_procedural_unit,
        )
        methods[(m.code.lower(), m.parameter.lower())].append(m)
        for ref in method.references:
            DBSession.add(
                models.MethodReference(
                    method=m,
                    sample_name=ref.sample_name,
                    sample_measured_value=ref.sample_measured_value,
                    uncertainty=ref.uncertainty,
                    uncertainty_unit=ref.uncertainty_unit,
                    number_of_measurements=ref.number_of_measurements,
                ))
        for ref in method.normalizations:
            DBSession.add(
                models.Normalization(
                    method=m,
                    reference_sample_name=ref.reference_sample_name,
                    reference_sample_accepted_value=ref.reference_sample_accepted_value,
                    citation=ref.citation,
                ))

    parameter = data.add(common.Parameter,
                         'c',
                         id='category',
                         name='Sample category')
    for i, opt in enumerate(attr.fields_dict(
            pypofatu.models.Sample)['sample_category'].validator.options,
                            start=1):
        data.add(common.DomainElement,
                 opt,
                 parameter=parameter,
                 id=str(i),
                 name=opt)

    DBSession.flush()
    assert parameter.pk

    # Add Samples and UnitParameters and Measurements
    for analysis in analyses:
        sample = analysis.sample
        vsid = '{0}-{1}'.format(sample.location.id,
                                data['Contribution'][sample.source_id].id)
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id=valid_id(vsid),
                language_pk=data['Location'][sample.location.id].pk,
                parameter_pk=parameter.pk,
                contribution_pk=data['Contribution'][sample.source_id].pk,
            )
        v = data['Sample'].get(sample.id)
        if not v:
            v = data.add(
                models.Sample,
                sample.id,
                id=valid_id(sample.id),
                name=sample.id,
                sample_name=sample.sample_name,
                sample_comment=sample.sample_comment,
                petrography=sample.petrography,
                latitude=sample.location.latitude,
                longitude=sample.location.longitude,
                elevation=sample.location.elevation,
                location_comment=sample.location.comment,
                site_name=sample.site.name,
                site_code=sample.site.code,
                site_context=sample.site.context,
                site_comment=sample.site.comment,
                site_stratigraphic_position=sample.site.stratigraphic_position,
                site_stratigraphy_comment=sample.site.stratigraphy_comment,
                domainelement=data['DomainElement'][sample.sample_category],
                valueset=vs,
                artefact_id=sample.artefact.id,
                artefact_name=sample.artefact.name,
                artefact_category=sample.artefact.category,
                artefact_comment=sample.artefact.comment,
                artefact_attributes=sample.artefact.attributes,
                artefact_collector=sample.artefact.collector,
                artefact_collection_type=sample.artefact.collection_type,
                artefact_collection_location=sample.artefact.collection_location,
                artefact_collection_comment=sample.artefact.collection_comment,
                artefact_fieldwork_date=sample.artefact.fieldwork_date,
            )
            DBSession.add(
                models.SampleReference(
                    description='sample',
                    sample=v,
                    source=data['Source'][sample.source_id]))
            for ref in sample.artefact.source_ids:
                DBSession.add(
                    models.SampleReference(description='artefact',
                                           sample=v,
                                           source=data['Source'][ref]))
            for ref in sample.site.source_ids:
                DBSession.add(
                    models.SampleReference(description='site',
                                           sample=v,
                                           source=data['Source'][ref]))

        a = data.add(
            models.Analysis,
            analysis.id,
            id=better_slug(analysis.id),
            name=analysis.id,
            sample=v,
        )

        for i, measurement in enumerate(analysis.measurements):
            if i == 0:
                method = measurement.method
                if method:
                    # NB: no trailing commas here - they would turn each
                    # assignment into a 1-tuple.
                    a.analyzed_material_1 = method.analyzed_material_1
                    a.analyzed_material_2 = method.analyzed_material_2
                    a.sample_preparation = method.sample_preparation
                    a.chemical_treatment = method.chemical_treatment
                    a.technique = method.technique
                    a.laboratory = method.laboratory
                    a.analyst = method.analyst

            pid = slug(measurement.parameter, lowercase=False)
            p = data['Param'].get(pid)
            if not p:
                p = data.add(models.Param,
                             pid,
                             id=pid,
                             name=measurement.parameter)
            data.add(
                models.Measurement,
                None,
                id='{0}-{1}'.format(a.id, p.id),
                analysis=a,
                method=data['Method'].get(measurement.method.id)
                if measurement.method else None,
                value=measurement.value,
                less=measurement.less,
                precision=measurement.value_sd,
                sigma=measurement.sd_sigma,
                unitparameter=p,
            )
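
midpoint() above shifts negative longitudes by +360 so Pacific point clouds
straddling the antimeridian stay contiguous before the convex-hull centroid
is taken, then shifts the result back. The normalization without shapely:

def mean_longitude(lons):
    shifted = [lon + 360 if lon < 0 else lon for lon in lons]
    mean = sum(shifted) / len(shifted)
    return mean - 360 if mean > 180 else mean

# two points one degree apart across the dateline average to 180, not 0:
assert mean_longitude([179.5, -179.5]) == 180.0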
Example #25
def main(args):  # pragma: no cover
    #
    # FIXME: more generic:
    # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld!
    # - Store datasets in defaultdict(list) keyed with module
    #
    datasets = {}
    for ds in iter_datasets(args.cldf.directory):
        datasets[ds.module] = ds

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    thedataset = data.add(
        common.Dataset,
        hindukush.__name__,
        id=hindukush.__name__,
        name='Hindu Kush Areal Typology',
        domain='hindukush.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )
    for i, name in enumerate(
        ['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']):
        common.Editor(
            dataset=thedataset,
            ord=i,
            contributor=common.Contributor(
                id=slug(HumanName(name).last), name=name))

    for rec in bibtex.Database.from_file(pathlib.Path(__file__).parent /
                                         'HK_website.bib',
                                         lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)
    for module, ds in sorted(datasets.items(), key=lambda i: i[0]):
        for lang in ds.iter_rows('LanguageTable', 'id', 'glottocode', 'name',
                                 'latitude', 'longitude'):
            if lang['id'] not in data['Variety']:
                data.add(
                    models.Variety,
                    lang['id'],
                    id=lang['id'],
                    name=lang['name'],
                    latitude=lang['latitude'],
                    longitude=lang['longitude'],
                    glottocode=lang['glottocode'],
                    subgroup=lang['SubGroup'],
                    location=lang['Location'],
                    elicitation=lang['Elicitation'],
                    jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])),
                )

        contrib = data.add(
            models.CLDFDataset,
            module,
            id=module,
            name='{} [{}]'.format(ds.properties.get('dc:title'), module),
            description=ds.properties.get('dc:bibliographicCitation'),
            module=module,
        )

        if module == 'Wordlist':
            for param in ds.iter_rows('ParameterTable', 'id',
                                      'concepticonReference', 'name'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name='{} [{}]'.format(param['name'], param['id']),
                    sortkey=param['id']
                    if not param['id'].startswith('Numerals') else
                    'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])),
                    concepticon_id=param['concepticonReference'],
                    contribution=contrib,
                    category=param['domain'] or 'ASJPlist',
                )

            audio = {
                r['ID']: r
                for r in ds.iter_rows('media.csv')
                if r['mimetype'] == 'audio/mpeg'
            }
            for form in ds.iter_rows('FormTable', 'id', 'form',
                                     'languageReference', 'parameterReference',
                                     'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                mp3 = next(
                    (audio[aid] for aid in form['Audio_Files'] if aid in audio),
                    None)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['form'],
                    valueset=vs,
                    jsondata=dict(
                        audio=ds.get_row_url('media.csv', mp3) if mp3 else None),
                )
        elif module == 'StructureDataset':
            for param in ds.iter_rows('ParameterTable', 'id', 'name',
                                      'description'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name=param['name'],
                    description=html(param['description'])
                    if param['description'] else None,
                    category=param['Category'],
                    contribution=contrib,
                )
            for code in ds.iter_rows('CodeTable', 'id', 'name', 'description',
                                     'parameterReference'):
                data.add(common.DomainElement,
                         code['id'],
                         id=code['id'],
                         name=code['name'],
                         description=code['description'],
                         parameter=data['Param'][code['parameterReference']],
                         jsondata={
                             'color': {
                                 'absent': 'ff0000',
                                 'present': '0000ff',
                                 'indeterminate': 'cccccc',
                             }.get(code['description'])
                         })
            #
            # FIXME: read CodeTable!
            #
            for form in ds.iter_rows('ValueTable', 'id', 'value',
                                     'languageReference', 'parameterReference',
                                     'codeReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['value'],
                    valueset=vs,
                    domainelement=data['DomainElement'][form['codeReference']])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
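
iter_datasets() collects one CLDF dataset per module, and the loader then
branches on the module name, as in the Wordlist/StructureDataset cases above.
A compact sketch of that dispatch, with hypothetical handlers:

def load_wordlist(ds):  # hypothetical handler
    print('loading forms from', ds)

def load_structure(ds):  # hypothetical handler
    print('loading values from', ds)

handlers = {'Wordlist': load_wordlist, 'StructureDataset': load_structure}
datasets = {'Wordlist': 'wl-metadata.json', 'StructureDataset': 'sd-metadata.json'}
for module, ds in sorted(datasets.items()):
    handlers[module](ds)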