Example #1
def main(args):  # pragma: no cover
    data = Data()

    print("Setting up dataset…")
    dataset = common.Dataset(
        id=cariban.__name__,
        domain="cariban.clld.org",
        name="Comparative Cariban Database",
        description="Comparative Cariban Database",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_url="https://www.eva.mpg.de",
        publisher_place="Leipzig",
        license="https://creativecommons.org/licenses/by/4.0/",
        contact="*****@*****.**",
        jsondata={'function_paradigms': []},
    )

    fps = []
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            for cons in morph_func["Construction"]:
                fps.append({
                    'Function': function,
                    'Construction': cons,
                    'Morpheme': morph_func['Morpheme']})
    dataset.update_jsondata(function_paradigms=fps)

    DBSession.add(dataset)
    DBSession.flush()

    print("Adding contributors…")
    c = common.Contributor(id="fm", name="Florian Matter", email="*****@*****.**", url="https://florianmatter.gitlab.io/")
    dataset.editors.append(common.Editor(contributor=c, ord=1, primary=True))

    print("Adding languages…")
    dialect_mapping = {}
    lang_shorthands = {}
    glottocodes = {}
    lang_ids = {}
    for lang in args.cldf["LanguageTable"]:
        if lang["Sampled"] == "y":
            language = data.add(
                common.Language,
                lang["ID"],
                id=lang["ID"],
                name=lang["Name"],
                latitude=float(lang["Latitude"]) if lang["Latitude"] is not None else None,
                longitude=float(lang["Longitude"]) if lang["Longitude"] is not None else None,
                jsondata={'Shorthand': lang['Shorthand'], 'Glottocode': lang['Glottocode']},
            )
            add_language_codes(data, language, isocode=lang["ISO"], glottocode=lang["Glottocode"])
        if lang["Dialect_Of"] not in [None, "y"]:
            dialect_mapping[lang["ID"]] = lang["Dialect_Of"]
        lang_shorthands[lang["Shorthand"]] = {"ID": lang["ID"], "Name": lang["Name"]}
        glottocodes[lang["Glottocode"]] = {"ID": lang["ID"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}
        lang_ids[lang["ID"]] = {"Glottocode": lang["Glottocode"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}

    def get_lang_id(key):
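        """Resolve a shorthand, glottocode, or plain ID to a language ID.

        Dialects are mapped to their parent language via dialect_mapping.
        """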
        if key in lang_shorthands:
            lang_id = lang_shorthands[key]["ID"]
        elif key in glottocodes:
            lang_id = glottocodes[key]["ID"]
        elif key in lang_ids:
            lang_id = key
        else:
            print("Could not identify language %s" % key)
            return None
        if lang_id in dialect_mapping:
            lang_id = dialect_mapping[lang_id]
        return lang_id

    def get_key_and_page(source_string):
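        """Split a source string of the form "key[pages]" into (key, pages).

        E.g. "smith2000[13-15]" -> ("smith2000", "13-15"); a bare key yields
        ("smith2000", ""). (The key shown here is only illustrative.)
        """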
        if len(source_string.split("[")) > 1:
            bib_key = source_string.split("[")[0]
            pages = source_string.split("[")[1].split("]")[0]
        else:
            bib_key = source_string
            pages = ""
        return bib_key, pages

    print("Adding sources…")
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    
    print("Adding language sources…")
    DBSession.flush()
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        if "keywords" in rec:
            for keyword in rec["keywords"].split(","):
                keyword = keyword.strip()
                if keyword in lang_shorthands:
                    lang_id = get_lang_id(keyword)
                    if lang_id in data["Language"]:
                        data.add(common.LanguageSource,
                            rec.id + lang_id,
                            language_pk=data["Language"][lang_id].pk,
                            source_pk=data["Source"][rec.id].pk
                        )
        
    data.add(
        common.Source,
        "pc",
        id="pc",
        name="Personal communication",
        description="Placeholder for data obtained from personal communication.",
        bibtex_type=bibtex.EntryType.misc
    )

#     print("Adding glossing abbreviations…")
#     length = len(pynterlinear.get_all_abbrevs().keys())
#     for i, (key, name) in enumerate(pynterlinear.get_all_abbrevs().items()):
#         print("%s/%s" % (i+1, length), end="\r")
#         DBSession.add(common.GlossAbbreviation(id=key, name=name))
#     print("")
#
    print("Adding examples…")
    gloss_replacements = {
        "S_A_": "Sa",
        "S_P_": "Sp"
    }
    def clldify_glosses(gloss_line):
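        """Normalize glosses for CLLD display, e.g. "3S_A_" -> "3.Sa"."""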
        for orig, new in gloss_replacements.items():
            gloss_line = gloss_line.replace(orig, new)
        gloss_line = re.sub(r"(\d)([A-Z])", r"\1.\2", gloss_line)
        return gloss_line

    for ex in args.cldf["ExampleTable"]:
        lang_id = get_lang_id(ex["Language_ID"])
        new_ex = data.add(common.Sentence,
            ex["ID"],
            id=ex["ID"],
            name=ex["Name"],
            description=ex["Translated_Text"],
            analyzed="\t".join(ex["Analyzed_Word"]),
            gloss=clldify_glosses("\t".join(ex["Gloss"])),
            language=data["Language"][lang_id],
            comment=ex["Comment"],
            markup_gloss="\t".join(ex["Morpheme_IDs"])
        )
        
        if ex["Source"]:
            bib_key, pages = get_key_and_page(ex["Source"])
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(common.SentenceReference(
                    sentence=new_ex,
                    source=source,
                    key=source.id,
                    description=pages.replace("--","–"))
                )

    def add_morpheme_reference(morpheme, source_string):
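        """Attach a bibliographic reference (with pages) to a morpheme."""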
        bib_key, pages = get_key_and_page(source_string)
        if bib_key in data["Source"]:
            source = data["Source"][bib_key]
            DBSession.add(models.MorphemeReference(
                morpheme=morpheme,
                source=source,
                key=source.id,
                description=pages.replace("--","–")
                )
            )

    print("Adding morphemes…")
    for morph in args.cldf["FormTable"]:
        lang_id = get_lang_id(morph["Language_ID"])
        form = util.merge_allomorphs("; ".join(morph["Form"])).split("; ")
        new_morph = data.add(models.Morpheme,
            morph["ID"],
            morpheme_type="grammatical",
            language=data["Language"][lang_id],
            name="/".join(form),
            id=morph["ID"],
        )
        
        if morph["Source"]: add_morpheme_reference(new_morph, morph["Source"][0])

    print("Adding constructions…")
    data.add(models.DeclarativeType, "imp", id="imp", name="imperative")
    data.add(models.DeclarativeType, "decl", id="decl", name="declarative")
    data.add(models.MainClauseVerb, "y", id="y", name="main clause construction")
    data.add(models.MainClauseVerb, "n", id="n", name="subordinate clause construction")

    for cons in args.cldf["ParameterTable"]:
        lang_id = get_lang_id(cons["Language_ID"])
        new_construction = data.add(
            models.Construction,
            cons["ID"],
            id=cons["ID"],
            language=data["Language"][lang_id],
            name=cons["Description"],
            mainclauseverb=data["MainClauseVerb"][cons["MainClauseVerb"]],
        )
        if cons["DeclarativeType"]: new_construction.declarativetype = data["DeclarativeType"][cons["DeclarativeType"]]

    def add_morph_func(morpheme, func_key, construction):
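        # NB: this relies on `function` from the enclosing loop for the data
        # key and the Meaning lookup; only the id/name use the passed `func_key`.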
        data.add(models.MorphemeFunction,
            "%s:%s" % (morpheme, function),
            id="%s:%s" % (morpheme, func_key),
            name="MorphemeFunction %s:%s"% (morpheme, func_key),
            unit=data["Morpheme"][morpheme],
            unitparameter=data["Meaning"][function],
            construction=construction
        )

    print("Adding morpheme functions…")
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            func_key = function.replace(".","_")
            if ">" in function or function == "LK" or bool(re.search(r"\d[SP]$", function) or function == "3"):
                meaning_type="inflectional"
            else:
                meaning_type="derivational"
            if function not in data["Meaning"]:
                data.add(models.Meaning,
                    function,
                    id=func_key,
                    name=function,
                    meaning_type=meaning_type
                )
            # Only some morpheme functions are specified as occurring in specific constructions
            if len(morph_func["Construction"]) == 0:
                for morpheme in morph_func["Morpheme"]:
                    add_morph_func(morpheme, func_key, None)
            else:
                for construction in morph_func["Construction"]:
                    if len(morph_func["Morpheme"]) == 1 and morph_func["Morpheme"][0] != "?":
                        for morpheme in morph_func["Morpheme"]:
                            if data["Morpheme"][morpheme].language != data["Construction"][construction].language:
                                print("Warning: the %s Morpheme %s is stated to occur in the %s construction %s!" % (
                                data["Morpheme"][morpheme].language,
                                data["Morpheme"][morpheme],
                                data["Construction"][construction].language,
                                data["Construction"][construction]
                                )
                                )
                            cons_func_key = func_key + ":" + construction
                            add_morph_func(morpheme, cons_func_key, data["Construction"][construction])

    print("Checking examples for illustrated morphemes…")
    proto_languages = ["pc"]
    is_illustrated = {}
    for key, row in data["MorphemeFunction"].items():
        if row.unit.language.id in proto_languages: continue
        is_illustrated["%s:%s" % (row.unit.id, row.unitparameter.id)] = False
    for row in args.cldf["ExampleTable"]:
        for word in row["Morpheme_IDs"]:
            morph_ids = util.split_word(word)
            for unit_value in morph_ids:
                if unit_value in ["X","-","=", "~"]:
                    continue
                unitvaluesentence_key = "{0}-{1}".format(unit_value.replace(".","-"),row["ID"])
                if unitvaluesentence_key in data["UnitValueSentence"].keys():
                    continue
                is_illustrated[unit_value] = True
                morph_id = unit_value.split(":")[0]
                if morph_id not in data["Morpheme"].keys():
                    print("Warning: Example %s illustrates unknown morpheme %s" % (row["ID"], morph_id))
                elif data["Morpheme"][morph_id].language != data["Sentence"][row["ID"]].language:
                    print("Warning: The %s example %s claims to contain the %s morpheme %s." % (
                        data["Sentence"][row["ID"]].language,
                        row["ID"],
                        data["Morpheme"][morph_id].language,
                        data["Morpheme"][morph_id]
                    )
                    )
                if ":" not in unit_value:
                    print("%s in %s contains no defined function!" % (unit_value, row["ID"]))
                function = unit_value.split(":")[1]
                morph_function_id = "%s:%s" % (morph_id, function)
                if morph_function_id not in data["MorphemeFunction"].keys():
                    print("Warning: Example %s tries to illustrate inexistent morpheme function %s!" % (row["ID"], unit_value.replace(".","-")))
                    continue
                data.add(models.UnitValueSentence,
                unitvaluesentence_key,
                sentence=data["Sentence"][row["ID"]],
                unitvalue=data["MorphemeFunction"][morph_function_id],
                )


    # see how many morpheme functions are illustrated with example sentences
    good_ill = [key for key, value in is_illustrated.items() if value]
    not_ill = [key for key, value in is_illustrated.items() if not value]
    not_ill.sort()
    cov = len(good_ill) / len(is_illustrated) * 100
    print("Morpheme exemplification coverage is at %s%%. List of unillustrated morphemes saved to unillustrated_morphemes.txt" % str(round(cov, 2)))
    with open("../unillustrated_morphemes.txt", "w") as f:
        for morph in not_ill:
            f.write(morph + "\n")

    print("Adding cognate sets…")
    for cogset in args.cldf["CognatesetTable"]:
        new_cset = data.add(models.Cognateset,
            cogset["ID"],
            id=cogset["ID"],
            name=cogset["Name"],
            description=cogset["Function"],
            cogset_type="grammatical"
        )
        if cogset["Source"]:
            for source in cogset["Source"]:
                bib_key, pages = get_key_and_page(source)
                if bib_key in data["Source"]:
                    source = data["Source"][bib_key]
                    DBSession.add(models.CognatesetReference(
                        cognateset=new_cset,
                        source=source,
                        key=source.id,
                        description=pages)
                        )

    print("Adding cognates…")
    for morph in args.cldf["FormTable"]:
        for cognate_ID in morph["Cognateset_ID"]:
            DBSession.add(models.Cognate(
                    cognateset=data["Cognateset"][cognate_ID],
                    counterpart=data["Morpheme"][morph["ID"]]
                    )
            )

    print("Adding morpheme comments…")
    for row in args.cldf["FormTable"]:
        data["Morpheme"][row["ID"]].markup_description=util.generate_markup(row["Comment"])

    print("Adding construction descriptions…")
    for cons in args.cldf["ParameterTable"]:
        if cons["Comment"] is None:
            description = ""
        else:
            description = util.generate_markup(cons["Comment"])
        description += "\n" + util.generate_markup(util.transitive_construction_paradigm(cons["ID"]))
        description += util.generate_markup(util.intransitive_construction_paradigm(cons["ID"]))
        data["Construction"][cons["ID"]].markup_description = description


    print("Adding cognate set descriptions…")
    for cogset in args.cldf["CognatesetTable"]:
        data["Cognateset"][cogset["ID"]].markup_description = util.generate_markup(cogset["Description"])
        # if cogset["ID"] == "13pro":
        #     data["Cognateset"][cogset["ID"]].markup_description += util.generate_markup(
        #         util.comparative_function_paradigm(
        #             ["apa_main", "tri_main", "way_main", "mak_main", "kar_main", "hix_main", "wai_main", "ara_main", "ikp_main", "wmr_main", "pan_old", "kax_main"],
        #             "1+3 scenarios",
        #             ["1+3S", "1+3>3", "3>1+3", "2>1+3", "1+3>2"]))

    
    def add_tree_labels(phylo):
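        """Replace terminal node names with full language names.

        Returns the relabeled tree and the list of nodes marked uncertain
        with "?".
        """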
        uncertain_nodes = []
        for node in phylo.find_clades():
            if node.name is None or not node.is_terminal():
                continue
            plain_name = node.name.replace("?","")
            if "?" in node.name: uncertain_nodes.append(plain_name)
            if plain_name in lang_ids:
                node.name = lang_ids[plain_name]["Name"].replace("'", "’")
            if plain_name in uncertain_nodes: node.name += "?"
        return phylo, uncertain_nodes
        
    print("Adding trees…")
    own_trees = ["matter"]
    tree_path = str(args.cldf.tablegroup._fname.parent / '..' / 'raw')
    newick_files = {}
    for tree in args.cldf["cariban_trees.csv"]:
        if tree["ID"] in own_trees: continue
        newick_files[tree["ID"]] = {
            "orig": tree["ID"]+"_orig.newick",
            "norm": tree["ID"]+"_norm.newick",
            "source": tree["Source"],
            "comment": tree["Comment"],
            "o_comment": tree["Orig_Comment"]
        }
    # adding my own trees separately.
    for my_tree_count, tree_id in enumerate(own_trees):
        my_tree = Phylo.read(tree_path+"/"+"%s.newick" % tree_id, "newick")
        my_tree, uncertain_nodes = add_tree_labels(my_tree)
        
        edited_tree = io.StringIO()
        Phylo.write(my_tree, edited_tree, "newick")
        tree = edited_tree.getvalue().replace(":0.00000","")
        
        my_phylo = Phylogeny(
                tree_id,
                id=tree_id,
                name="Matter (2020)",# % str(my_tree_count+1),
                newick=tree,
                markup_description="My own, conservative, classification."
        )
        
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes: lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=my_phylo
                )
            )
              
        DBSession.add(my_phylo)
        
    # adding the other trees
    for tree_id, values in newick_files.items():
        norm_biotree = Phylo.read(tree_path+"/"+values["norm"], "newick")
        orig_biotree = Phylo.read(tree_path+"/"+values["orig"], "newick")
        
        norm_biotree, uncertain_nodes = add_tree_labels(norm_biotree)
            
        edited_tree = io.StringIO()
        Phylo.write(norm_biotree, edited_tree, "newick")
        norm_tree = edited_tree.getvalue().replace(":0.00000","")
        
        edited_tree = io.StringIO()
        Phylo.write(orig_biotree, edited_tree, "newick")
        orig_tree = edited_tree.getvalue().replace(":0.00000","")
        
        norm_phylo = Phylogeny(
                id=tree_id+"_norm",
                name=str(data["Source"][values["source"]]) + " (Normalized)",
                markup_description=util.generate_markup("Source: src:"+values["source"])+
                "<br>This is a normalized version of <a href='/phylogeny/%s_orig'>this original tree</a>." % tree_id +
                util.generate_markup(
                    "<br>Comments: %s" % values["comment"]
                ),
                newick=norm_tree
        )
        
        if values["o_comment"] == None:
            o_comment = ""
        else:
            o_comment = values["o_comment"]
        orig_phylo = Phylogeny(
                id=tree_id+"_orig",
                name=str(data["Source"][values["source"]]) + " (Original)",
                markup_description=util.generate_markup("Source: src:"+values["source"])+
                    "<br>This is a representation of the original classification. A normalized version can be found <a href='/phylogeny/%s_norm'>here</a>." % tree_id +
                    util.generate_markup(
                    "<br>Comments: %s" % values["comment"] +
                    " " + o_comment
                    ),
                newick=orig_tree
        )
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes: lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=norm_phylo
                )
            )
        DBSession.add(norm_phylo)
        DBSession.add(orig_phylo)

    print("Adding t-adding verb cognate sets…")
    for t_verb_set in args.cldf["cariban_t_cognates.csv"]:
        cognate_ID = "t"+t_verb_set["ID"]
        rec_t_form = "*[%s]%s" % (t_prefix_form(t_verb_set["Form"]), t_verb_set["Form"])
        t_cogset = data.add(models.Cognateset,
            cognate_ID,
            id=cognate_ID,
            name=rec_t_form,
            description="‘%s’ (*t-adding verb)" % t_verb_set["Parameter_ID"],
            cogset_type="t_adding"
        )
        if t_verb_set["Source"]:
            bib_key = t_verb_set["Source"].split("[")[0]
            if len(t_verb_set["Source"].split("[")) > 1:
                pages = t_verb_set["Source"].split("[")[1].split("]")[0]
            else:
                pages = " "
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(models.CognatesetReference(
                    cognateset=t_cogset,
                    source=source,
                    key=source.id,
                    description=pages)
                    )
    
    print("Adding t-adding verbs…")
    t_langs = {}
    t_verbs = {}
    non_t_adding_lgs = ["ing","mac","kar","wmr","pan"]
    data.add(models.Meaning,
        "t_verb",
        id="t-verb",
        name="t-adding verb",
    )
    for t_verb_entry in args.cldf["cariban_t_verbs.csv"]:
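        # "cari1283" appears to be the family-level Glottolog code (Cariban); skip it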
        if t_verb_entry["Language_ID"] == "cari1283": continue
        cognate_ID = "t"+t_verb_entry["Cognateset_ID"]
        lang_id = get_lang_id(t_verb_entry["Language_ID"])
        morph_id = lang_id+"_"+cognate_ID
        if morph_id in data["Morpheme"].keys():
            if morph_id + "_2" in data["Morpheme"].keys():
                morph_id += "_3"
            else:
                morph_id += "_2"
        t_verb = data.add(models.Morpheme,
            morph_id,
            id=morph_id,
            morpheme_type="t_adding",
            name=t_verb_entry["Form"],
            language=data["Language"][lang_id],
        )
        DBSession.add(models.Cognate(
                cognateset=data["Cognateset"][cognate_ID],
                counterpart=t_verb
            )
        )
        if t_verb_entry["t"] == "y":
            t_verb.name = "[%s]%s" % (t_prefix_form(t_verb.name), t_verb.name)
            t_verb.markup_description = util.generate_markup("Shows cogset:t")
        if t_verb_entry["t"] == "?" and lang_id not in non_t_adding_lgs:
            t_verb.name = "[t-?]"+t_verb.name
            t_verb.markup_description = util.generate_markup("It is not known if this verb shows cogset:t")
        if t_verb_entry["t"] == "n":
            t_verb.markup_description = util.generate_markup("Does not show cogset:t")
        if lang_id not in t_langs.keys():
            t_langs[lang_id] = {"y": 0, "n": 0, "?": 0}
        if cognate_ID not in t_verbs.keys():
            t_verbs[cognate_ID] = {"y": 0, "n": 0, "?": 0}
        t_langs[lang_id][t_verb_entry["t"]] += 1
        if lang_id not in non_t_adding_lgs:
            t_verbs[cognate_ID][t_verb_entry["t"]] += 1
        if t_verb_entry["Source"]:
            add_morpheme_reference(t_verb, t_verb_entry["Source"])

        data.add(models.MorphemeFunction,
            "t_"+t_verb_entry["ID"],
            id="t_"+t_verb_entry["ID"],
            name="t-Verb %s" % t_verb_entry["Parameter_ID"],
            unit=t_verb,
            unitparameter=data["Meaning"]["t_verb"],
            construction=None
        )
    for lang, values in t_langs.items():
        data["Language"][lang].update_jsondata(t_values=values)
    for verb, values in t_verbs.items():
        # data["Cognateset"][verb].description += " (%s/%s)" % (str(values["y"]), str(values["n"]+values["y"]+values["?"]))
        data["Cognateset"][verb].markup_description = util.generate_markup("This verb occurs with obj:t- in %s of %s languages which show reflexes of cogset:t." % (str(values["y"]), str(values["n"]+values["y"]+values["?"])))

    print("Adding reconstructed lexemes…")
    proto_forms = {}
    for cogset in args.cldf["cariban_lexical_reconstructions.csv"]:
        proto_forms[cogset["ID"]] = cogset["Form"]

    first_found = []
    for entry in args.cldf["cariban_swadesh_list.csv"]:
        cognateset_ID = entry["Parameter_ID"].replace("/","_")+"-"+entry["Cognateset_ID"]
        if cognateset_ID not in data["Cognateset"]:
            if cognateset_ID in proto_forms:
                form = "*" + proto_forms[cognateset_ID].replace("; ", " / ")
            # else:
            #     form = ""
                data.add(models.Cognateset,
                    cognateset_ID,
                    id=cognateset_ID,
                    name=form,
                    description=cognateset_ID,
                    cogset_type="lexical"
                )
        lang_id = get_lang_id(entry["Language_ID"])
        if lang_id not in data["Language"]: continue
        function = entry["Parameter_ID"].replace(".","_")
        morph_id = entry["Language_ID"] + "_" + function
        if morph_id in first_found: continue
        first_found.append(morph_id)
        if function not in data["Meaning"].keys():
            data.add(models.Meaning,
                function,
                id=function,
                name=function,
                meaning_type="lexical"
            )
        morpheme = data.add(models.Morpheme,
                    morph_id,
                    id=morph_id,
                    morpheme_type="lexical",
                    name=entry["Value"][0],
                    language=data["Language"][lang_id],
                )
        data.add(models.MorphemeFunction,
            "%s:%s" % (morph_id, function),
            id="%s:%s" % (morph_id, function),
            name="MorphemeFunction %s:%s"% (morph_id, function),
            unit=data["Morpheme"][morph_id],
            unitparameter=data["Meaning"][function],
            construction=None
        )
        if entry["Source"]:
            add_morpheme_reference(morpheme, entry["Source"])
        
        if cognateset_ID in proto_forms:
            DBSession.add(models.Cognate(
                    cognateset=data["Cognateset"][cognateset_ID],
                    counterpart=morpheme
                )
            )
Example #2
    def setUp(self):
        TestWithDb.setUp(self)

        DBSession.add(
            common.Dataset(id='dataset',
                           name='dataset',
                           description='desc',
                           domain='clld'))

        source = common.Source(id='source')
        contributors = {
            'contributor': 'A Name',
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'
        }
        for id_, name in contributors.items():
            contributors[id_] = common.Contributor(id=id_, name=name)

        contribution = common.Contribution(id='contribution',
                                           name='Contribution')
        cr = common.ContributionReference(contribution=contribution,
                                          source=source)
        assert common.ContributionContributor(
            contribution=contribution,
            primary=True,
            contributor=contributors['contributor'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['b'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=True,
                                              contributor=contributors['c'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['d'])

        DBSession.add(contribution)

        language = common.Language(id='language',
                                   name='Language 1',
                                   latitude=10.5,
                                   longitude=0.3)
        language.sources.append(source)
        identifier = common.Identifier(type='iso639-3', id='iso')
        li = common.LanguageIdentifier(language=language,
                                       identifier=identifier)

        for i in range(2, 102):
            _l = common.Language(id='l%s' % i, name='Language %s' % i)
            _i = common.Identifier(type='iso639-3',
                                   id='%.3i' % i,
                                   name='%.3i' % i)
            _li = common.LanguageIdentifier(language=_l, identifier=_i)
            DBSession.add(_l)

        param = common.Parameter(id='parameter', name='Parameter')
        de = common.DomainElement(id='de',
                                  name='DomainElement',
                                  parameter=param)
        de2 = common.DomainElement(id='de2',
                                   name='DomainElement2',
                                   parameter=param)
        valueset = common.ValueSet(id='valueset',
                                   language=language,
                                   parameter=param,
                                   contribution=contribution)
        value = common.Value(id='value',
                             domainelement=de,
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)
        paramnd = common.Parameter(id='no-domain',
                                   name='Parameter without domain')
        valueset = common.ValueSet(id='vs2',
                                   language=language,
                                   parameter=paramnd,
                                   contribution=contribution)
        vr = common.ValueSetReference(valueset=valueset, source=source)
        value = common.Value(id='v2',
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)

        unit = common.Unit(id='unit', name='Unit', language=language)
        up = common.UnitParameter(id='unitparameter', name='UnitParameter')
        DBSession.add(unit)
        DBSession.add(
            common.UnitValue(id='unitvalue',
                             name='UnitValue',
                             unit=unit,
                             unitparameter=up))

        up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
        de = common.UnitDomainElement(id='de', name='de', parameter=up2)
        DBSession.add(
            common.UnitValue(id='uv2',
                             name='UnitValue2',
                             unit=unit,
                             unitparameter=up2,
                             unitdomainelement=de))

        DBSession.add(common.Source(id='s'))

        sentence = common.Sentence(id='sentence',
                                   name='sentence name',
                                   description='sentence description',
                                   analyzed='a\tmorpheme\tdoes\tdo',
                                   gloss='a\tmorpheme\t1SG\tdo.SG2',
                                   source='own',
                                   comment='comment',
                                   original_script='a morpheme',
                                   language=language)
        sr = common.SentenceReference(sentence=sentence, source=source)
        DBSession.add(common.Config(key='key', value='value'))
        DBSession.flush()
Example #3
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0]
            if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'
        })
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict(
        (row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
            'CLIT': 'clitic',
            'IMPF': 'imperfect',
            'INTERM': 'intermediate',
            'NCOMPL': 'noncompletive',
            'NONFUT': 'nonfuture',
            'NPROX': 'nonproximal',
            'NSG': 'nonsingular',
            'PP': 'past participle',
            'PROP': 'proprietive',
            'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'),
                      delimiter=',',
                      namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'),
                                             name=row.meaning))

    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
                'Additional_information': 'note',
                'Article_title': 'title',
                'Book_title': 'booktitle',
                'City': 'address',
                'Editors': 'editor',
                'Full_reference': None,
                'Issue': None,
                'Journal': 'journal',
                'Language_codes': None,
                'LaTeX_cite_key': None,
                'Pages': 'pages',
                'Publisher': 'publisher',
                'Reference_type': 'type',
                'School': 'school',
                'Series_title': 'series',
                'URL': 'url',
                'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(common.Source,
                     row['Reference_ID'],
                     id=str(row['Reference_ID']),
                     name=row['Reference_name'],
                     description=title,
                     author=row['Authors'],
                     year=year,
                     year_int=year_int,
                     bibtex_type=getattr(EntryType, row['BibTeX_type']
                                         or 'misc'),
                     jsondata=jsondata,
                     **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [
            float(c.strip()) for c in row['map_coordinates'].split(',')
        ]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(
                common.Language_data(object_pk=lect.pk,
                                     ord=i,
                                     key=item[0],
                                     value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print('unchecked! ---', row['Language_name'])

        desc = row.get(
            'Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(row[
            'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']
                                       )

        c = data.add(
            models.ApicsContribution,
            row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c,
                                          id=fid,
                                          name=label,
                                          mime_type=mtype)
            else:
                print(label, 'missing for:', row['Language_name'])

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(common.Identifier,
                         'iso:%s' % row['ISO_code'],
                         id=row['ISO_code'].lower(),
                         name=row['ISO_code'].lower(),
                         type=common.IdentifierType.iso.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(common.Identifier,
                                  'gc:%s' % glottocodes[lect.id],
                                  id=glottocodes[lect.id],
                                  name=glottocodes[lect.id],
                                  type=common.IdentifierType.glottolog.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(common.Identifier,
                         row['Language_name_ethnologue'],
                         id=iso
                         or 'ethnologue:%s' % row['Language_name_ethnologue'],
                         name=row['Language_name_ethnologue'],
                         type='ethnologue')

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier'][
                        row['Language_name_ethnologue']]))

    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args,
            data,
            id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip()
                or None
            },
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(
                    common.SentenceReference(
                        sentence=p,
                        source=source,
                        key=source.id,
                        description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(
            common.ContributionReference(
                contribution=data['ApicsContribution'][row['Language_ID']],
                source=source,
                description=row['Pages'],
                key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(models.Feature,
                     row['Feature_code'],
                     name=row['Feature_name'],
                     id=id_,
                     description=desc,
                     markup_description=normalize_markup(
                         row['z_calc_Feature_annotation_publication_CSS']),
                     feature_type='primary',
                     multivalued=row['Value_relation_type'] != 'Single',
                     area=row['Feature_area'],
                     wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement,
                '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(
                    models.FeatureAuthor(ord=i + 1,
                                         contributor=editors[name.strip()]))

        DBSession.flush()

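    # primary features whose extra data (examples, references) is reused from
    # the corresponding segment features; values are remapped to the running
    # feature count below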
    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[
            row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(models.Feature,
                     row['Segment_feature_number'],
                     name=name,
                     id=str(feature_count),
                     feature_type='segment',
                     area='Vowels' if truth(row['Vowel']) else
                     ('Obstruent consonants'
                      if truth(row['Obstruent']) else 'Sonorant consonants'),
                     jsondata=dict(
                         number=int(row['Segment_feature_number']),
                         vowel=truth(row['Vowel']),
                         consonant=truth(row['Consonant']),
                         obstruent=truth(row['Obstruent']),
                         core_list=truth(row['Core_list_segment']),
                         symbol=symbol,
                     ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(common.DomainElement,
                     '%s-%s' % (row['Segment_feature_number'], spec[0]),
                     id='%s-%s' % (p.id, i),
                     name=spec[0],
                     parameter=p,
                     jsondata={'color': spec[1]},
                     number=i)

    print('--> remapped:', primary_to_segment)
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features',
                    'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(models.Feature,
                     row['Sociolinguistic_feature_code'],
                     name=row['Sociolinguistic_feature_name'],
                     id='%s' % feature_count,
                     description=row['Sociolinguistic_feature_annotation'],
                     area='Sociolinguistic',
                     feature_type='sociolinguistic')

        names = {}

        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(common.DomainElement,
                     id_,
                     id='%s-%s' % (p.id, i),
                     name=name,
                     parameter=p,
                     number=i,
                     jsondata={
                         'color':
                         colors.get(row['Value%s_colour_ID' % i],
                                    list(colors.values())[i])
                     })

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(args,
                             data,
                             '%s-p%s' % (lang.id, data['Feature'][number].id),
                             id='%s-%s' %
                             (lang.id, example_count[row['Language_ID']]),
                             name=row['Example_word'],
                             description=row['Example_word_gloss'],
                             language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(
                common.ValueSetReference(valueset=valueset,
                                         source=source,
                                         key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[
                row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[
                row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
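        """Return the column name for `attr`: prefixed if `_prefix` is set
        (e.g. 'Sociolinguistic_data'), capitalized otherwise ('Data')."""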
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' %
                               i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id',
                                          _prefix)], '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (row[prefix(
                        'feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' %
                                        i]) if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number':
                        wals_value_number.pop(row[prefix(
                            'data_record_id', _prefix)])
                    }
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][
                    row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print('multiple values for single-valued parameter: %s' % id_)
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(
                        parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value,
                                                 sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(
                            common.ValueSetReference(valueset=valueset,
                                                     source=r.source,
                                                     key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert (row['Reference_ID'] in data['Source']
                    or row['Reference_ID'] in non_bibs)
            try:
                vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s' %
                                        row],
                    sentence=data['Sentence'][
                        '%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print('unclaimed wals value number:', k, v)

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(contribution=data['ApicsContribution'][row['Language ID']],
                  contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
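
This loader illustrates clld's two-tier value model: one ValueSet per (language, parameter, contribution) triple, with individual Value rows attached to it. A minimal sketch of that pattern, using placeholder instances (lang, param, contrib and de are stand-ins, not variables from the script above):

# Minimal sketch of the ValueSet/Value pattern used above; lang, param,
# contrib and de are placeholder model instances, not script variables.
vs = common.ValueSet(
    id='%s-%s' % (lang.id, param.id),  # conventional <language>-<parameter> id
    language=lang, parameter=param, contribution=contrib)
DBSession.add(common.Value(
    id='%s-1' % vs.id, valueset=vs, domainelement=de, confidence='high'))
DBSession.flush()  # assign primary keys before anything references them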
Example #4
def load():
    wals = create_engine('postgresql://robert@/wals3')

    contributor = common.Contributor(id='gastvolker', name='Volker Gast')
    contribution = common.Contribution(
        id='tdir', name='Typological Database of Intensifiers and Reflexives')
    cc = common.ContributionContributor(
        contribution=contribution, contributor=contributor)
    DBSession.add(cc)

    for row in read('glosses'):
        DBSession.add(common.GlossAbbreviation(id=row['gloss'], name=row['explanation']))

    params = {}
    for id_, name in PARAMS.items():
        params[id_] = common.Parameter(id='tdir-' + id_, name=name)
        DBSession.add(params[id_])
        #
        # TODO: domain for sortal restrictions!
        #

    values = {}
    languages = {}
    for row in read('languages'):
        if row['adn'] and '<br>' in row['adn']:
            row['adn'], other = row['adn'].split('<br>', 1)
            if not row['otherint']:
                row['otherint'] = ''
            row['otherint'] = '\n'.join(filter(None, row['otherint'].split('<br>') + other.split('<br>')))

        row['sil'] = row['sil'].lower()
        row['sil'] = {
            'arm': 'hye',
            'vmn': 'mig',
            'gli': 'gle',
            'grk': 'ell',
            'hbr': 'heb',
            'ltn': 'lat',
            'chn': 'cmn',
            'ota': 'ote',
            'pnj': 'pan',
            'pba': 'rap',
            'esg': 'kal',
            'vla': 'zea',
            'lat': 'lav',
        }.get(row['sil'], row['sil'])

        l = common.Language(id=row['sil'].lower(), name=row['language'])
        languages[row['language']] = l
        res = wals.execute("select l.latitude, l.longitude from language as l, languageidentifier as li, identifier as i where l.pk = li.language_pk and li.identifier_pk = i.pk and i.id = '%s' and i.type = 'iso639-3';" \
                           % row['sil']).fetchone()
        if not res:
            res = wals.execute("select latitude, longitude from language where name = '%s';" % row['language']).fetchone()

        if res:
            l.latitude, l.longitude = res
        else:
            print(row['language'], row['sil'])
            # (u'Classical Nahuatl', u'nci')   ???
            # (u'Ancient Greek', u'gko')

        for pid in params.keys():
            value = row[pid]
            if value:
                value = common.Value(
                    id='tdir-%s-%s' % (pid, l.id),
                    name=str(bs(value)),
                    contribution=contribution,
                    parameter=params[pid],
                    language=l)
                values['%s-%s' % (pid, row['language'])] = value
                DBSession.add(value)

    def normalize_ref(ref):
        ref = re.sub(r'\s+', ' ', ref).strip()
        return str(bs(ref)).replace('<i>', '"').replace('</i>', '"')

    """
Ogawa, A. (1998)
Wali, K. et al. (2000)

Lyutikova. -> Lyutikova,
se-Bertit -> se-Berit

missing refs:
Sengupta, G. (2000). Lexical anaphors and pronouns in Bangla. In Lust et al. (eds.), <i>Lexical Anaphors and Pronouns in Selected South Asian Languages</i>. Berlin: Mouton de Gruyter.
Davison, A. Mistry (2000). Lexical anaphors and pronouns in Hindi/Urdu. In Lust et al. (eds.), <i>Lexical Anaphors and Pronouns in Selected South Asian Languages</i>. Berlin: Mouton de Gruyter.

"""

    refs = {}
    for row in read('references'):
        name = re.sub(r'\s+', ' ', row['entry'].split(').')[0].strip()) + ')'
        src = common.Source(
            id=row['ref'].strip(), name=name, description=normalize_ref(row['entry']))
        refs[name] = src
        DBSession.add(src)

    for row in read('examples'):
        if row['language'] not in languages:
            print('example for unknown language "%s"' % row['language'])
            continue

        s = common.Sentence(
            id=row['Nr'].strip(),
            name=fix_example(row['original'], repl=' '),
            language=languages[row['language']],
            analyzed=fix_example(row['original']),
            gloss=fix_example(row['gloss']),
            description=row['translation'],
            source=row['source'],
            comment=row['comments'])

        has_refs = False
        for ref in refs:
            if ref in row['source']:
                if normalize_ref(row['source']) != refs[ref].description:
                    print('-->')
                    print(row['source'])
                has_refs = True
                common.SentenceReference(sentence=s, source=refs[ref])

        if not has_refs:
            print('+++++')
            print(row['source'])

        pid = EXAMPLE_MAP[row['pov']]
        if pid:
            # associate with value!
            common.ValueSentence(
                value=values['%s-%s' % (pid, row['language'])], sentence=s)

        DBSession.add(s)
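
One caveat in load(): the WALS coordinate lookups build SQL by string interpolation, which breaks as soon as a language name contains a quote. A hedged rewrite of the first lookup with bound parameters (same query as above; text is plain SQLAlchemy 1.x API, matching the Engine.execute style used here):

from sqlalchemy import create_engine, text

# Parameterized variant of the coordinate lookup in load(); binding
# :code avoids the quoting pitfalls of '%s' interpolation.
wals = create_engine('postgresql://robert@/wals3')
res = wals.execute(
    text("select l.latitude, l.longitude "
         "from language as l, languageidentifier as li, identifier as i "
         "where l.pk = li.language_pk and li.identifier_pk = i.pk "
         "and i.id = :code and i.type = 'iso639-3'"),
    code='hye').fetchone()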
Example #5
def populate_test_db(engine):
    set_alembic_version(engine, '58559d4eea0d')

    data = TestData()
    data.add_default(common.Dataset,
                     domain='clld',
                     jsondata={
                         'license_icon': 'cc-by',
                         'license_url': 'http://example.org'
                     })

    data.add_default(common.Contributor, name='A Name', email='*****@*****.**')
    for id_, name in {
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name',
    }.items():
        data.add(common.Contributor,
                 id_,
                 id=id_,
                 name=name,
                 url='http://example.org')

    DBSession.add(
        common.Editor(dataset=data[common.Dataset],
                      contributor=data[common.Contributor]))

    data.add_default(common.Source)
    data.add(common.Source,
             'replaced',
             id='replaced',
             active=False,
             jsondata={'__replacement_id__': 'source'})

    data.add_default(common.Contribution)
    common.ContributionReference(contribution=data[common.Contribution],
                                 source=data[common.Source])

    for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'),
                       (False, 'd')]:
        common.ContributionContributor(contribution=data[common.Contribution],
                                       primary=primary,
                                       contributor=data['Contributor'][c])

    data.add_default(common.Language, latitude=10.5, longitude=0.3)
    data[common.Language].sources.append(data[common.Source])

    for i, type_ in enumerate(common.IdentifierType):
        common.LanguageIdentifier(
            language=data[common.Language],
            identifier=common.Identifier(
                type=type_.value,
                id=type_.value + str(i),
                name='abc' if type_.name == 'iso' else 'glot1234'))

    common.LanguageIdentifier(language=data[common.Language],
                              identifier=common.Identifier(type='name',
                                                           id='name',
                                                           name='a'))

    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='abc')
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = data.add_default(common.Parameter)
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2',
                               name='DomainElement2',
                               parameter=param)

    valueset = data.add_default(common.ValueSet,
                                language=data[common.Language],
                                parameter=param,
                                contribution=data[common.Contribution])
    common.ValueSetReference(valueset=valueset,
                             source=data[common.Source],
                             description='10-20')

    data.add_default(common.Value,
                     domainelement=de,
                     valueset=valueset,
                     frequency=50,
                     confidence='high')
    data.add(common.Value,
             'value2',
             id='value2',
             domainelement=de2,
             valueset=valueset,
             frequency=50,
             confidence='high')

    paramnd = data.add(common.Parameter,
                       'no-domain',
                       id='no-domain',
                       name='Parameter without domain')
    valueset = common.ValueSet(id='vs2',
                               language=data[common.Language],
                               parameter=paramnd,
                               contribution=data[common.Contribution])

    common.ValueSetReference(valueset=valueset,
                             source=data[common.Source],
                             description='10-20')
    common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')

    unit = data.add_default(common.Unit, language=data[common.Language])
    up = data.add_default(common.UnitParameter)
    common.UnitValue(id='unitvalue',
                     name='UnitValue',
                     unit=unit,
                     unitparameter=up)

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(id='uv2',
                         name='UnitValue2',
                         unit=unit,
                         unitparameter=up2,
                         unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = data.add_default(common.Sentence,
                                description='sentence description',
                                analyzed='a\tmorpheme\tdoes\tdo',
                                gloss='a\tmorpheme\t1SG\tdo.SG2',
                                source='own',
                                comment='comment',
                                original_script='a morpheme',
                                language=data[common.Language],
                                jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=data[common.Source])
    DBSession.add(common.Config(key='key', value='value'))

    common.Config.add_replacement('replaced',
                                  'language',
                                  model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
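
The Config.add_replacement calls at the end record, in the plain key/value Config table, which objects were superseded ('replaced' now points at 'language') and which are gone entirely; clld's web layer consults these entries to answer requests for stale ids. A hedged sketch of reading such an entry back, assuming Config.replacement_key (the helper clld itself uses to build these keys) keeps its current shape:

# Hedged sketch: look up what happened to the Language once known as
# 'replaced'; the key layout is a clld internal built by replacement_key.
key = common.Config.replacement_key(common.Language, 'replaced')
entry = DBSession.query(common.Config).filter_by(key=key).first()
if entry:
    print('superseded by:', entry.value)  # expected: 'language'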
Example #6
    def setUp(self):
        TestWithDb.setUp(self)

        DBSession.add(
            common.Dataset(id='dataset',
                           name='dataset',
                           description='desc',
                           domain='clld',
                           jsondata={'license_icon': 'cc-by'}))

        DBSession.add(
            common.Source(id='replaced',
                          active=False,
                          jsondata={'__replacement_id__': 'source'}))
        source = common.Source(id='source')
        contributors = {
            'contributor': 'A Name',
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'
        }
        for id_, name in contributors.items():
            contributors[id_] = common.Contributor(id=id_,
                                                   name=name,
                                                   url='http://example.org')

        contribution = common.Contribution(id='contribution',
                                           name='Contribution')
        common.ContributionReference(contribution=contribution, source=source)
        assert common.ContributionContributor(
            contribution=contribution,
            primary=True,
            contributor=contributors['contributor'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['b'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=True,
                                              contributor=contributors['c'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['d'])

        DBSession.add(contribution)

        language = common.Language(id='language',
                                   name='Language 1',
                                   latitude=10.5,
                                   longitude=0.3)
        language.sources.append(source)
        for i, type_ in enumerate(common.IdentifierType):
            id_ = common.Identifier(type=type_.value,
                                    id=type_.value + str(i),
                                    name='abc')
            common.LanguageIdentifier(language=language, identifier=id_)

        for i in range(2, 102):
            _l = common.Language(id='l%s' % i, name='Language %s' % i)
            _i = common.Identifier(type='iso639-3',
                                   id='%.3i' % i,
                                   name='%.3i' % i)
            common.LanguageIdentifier(language=_l, identifier=_i)
            DBSession.add(_l)

        param = common.Parameter(id='parameter', name='Parameter')
        de = common.DomainElement(id='de',
                                  name='DomainElement',
                                  parameter=param)
        de2 = common.DomainElement(id='de2',
                                   name='DomainElement2',
                                   parameter=param)
        valueset = common.ValueSet(id='valueset',
                                   language=language,
                                   parameter=param,
                                   contribution=contribution)
        value = common.Value(id='value',
                             domainelement=de,
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)
        value2 = common.Value(id='value2',
                              domainelement=de2,
                              valueset=valueset,
                              frequency=50,
                              confidence='high')
        DBSession.add(value2)
        paramnd = common.Parameter(id='no-domain',
                                   name='Parameter without domain')
        valueset = common.ValueSet(id='vs2',
                                   language=language,
                                   parameter=paramnd,
                                   contribution=contribution)
        common.ValueSetReference(valueset=valueset, source=source)
        value = common.Value(id='v2',
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)

        unit = common.Unit(id='unit', name='Unit', language=language)
        up = common.UnitParameter(id='unitparameter', name='UnitParameter')
        DBSession.add(unit)
        DBSession.add(
            common.UnitValue(id='unitvalue',
                             name='UnitValue',
                             unit=unit,
                             unitparameter=up))

        up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
        de = common.UnitDomainElement(id='de', name='de', parameter=up2)
        DBSession.add(
            common.UnitValue(id='uv2',
                             name='UnitValue2',
                             unit=unit,
                             unitparameter=up2,
                             unitdomainelement=de))

        DBSession.add(common.Source(id='s'))

        sentence = common.Sentence(
            id='sentence',
            name='sentence name',
            description='sentence description',
            analyzed='a\tmorpheme\tdoes\tdo',
            gloss='a\tmorpheme\t1SG\tdo.SG2',
            source='own',
            comment='comment',
            original_script='a morpheme',
            language=language,
            jsondata={'alt_translation': 'Spanish: ...'})
        common.SentenceReference(sentence=sentence, source=source)
        DBSession.add(common.Config(key='key', value='value'))

        common.Config.add_replacement('replaced',
                                      'language',
                                      model=common.Language)
        common.Config.add_replacement('gone', None, model=common.Language)
        DBSession.flush()
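
Since this setUp mirrors populate_test_db above, tests built on it can exercise the fixture with ordinary SQLAlchemy queries; for instance (standard query API over the models just created, nothing clld-specific beyond the relationships):

# Sketch: list every value in the fixture under its parameter, via the
# ValueSet relationship; domain elements are optional, hence the guard.
for vs in DBSession.query(common.ValueSet):
    for v in vs.values:
        print(vs.parameter.name, v.id,
              v.domainelement.name if v.domainelement else '-')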
Example #7
def main(args):
    data = Data()
    doi = input('DOI of the released dataset: ')

    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date.today(),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'doi': doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
    for c in args.cldf['contributors.csv']:
        contrib = data.add(
            models.WaveContributor,
            c['ID'],
            id=c['ID'],
            name=c['Name'],
            email=c['Email'],
            url=c['URL'],
            address=c['Address'],
            sortkey=HumanName(c['Name']).last,
        )
        m = ed_pattern.match(c['ID'])
        if m:
            common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord')))

    for fc in args.cldf['featurecategories.csv']:
        data.add(
            models.FeatureCategory, fc['ID'],
            id=fc['ID'], name=fc['Name'], description=fc['Description'])

    for vt in args.cldf['varietytypes.csv']:
        data.add(
            models.VarietyType, vt['ID'],
            id=vt['ID'],
            name=vt['Name'],
            description=vt['Description'],
            jsondata=VARIETY_TYPE_ICONS[vt['ID']],
        )

    for vt in args.cldf['regions.csv']:
        data.add(models.Region, vt['ID'], id=vt['ID'], name=vt['Name'])

    for lang in args.cldf['LanguageTable']:
        l = data.add(
            models.Variety, lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
            latitude=lang['Latitude'],
            longitude=lang['Longitude'],
            abbr=lang['abbr'],
            region=data['Region'][lang['Region_ID']],
            type=data['VarietyType'][lang['Type_ID']],
        )
        if lang['Glottocode']:
            add_language_codes(data, l, None, glottocode=lang['Glottocode'])
        c = data.add(
            models.WaveContribution, lang['ID'],
            id=str(lang['ID']),
            name=lang['Name'],
            description=lang['Description'],
            variety=l)
        for i, cid in enumerate(lang['Contributor_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=data['WaveContributor'][cid],
                ord=i+1,
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            models.Feature, param['ID'],
            id=param['ID'],
            category=data['FeatureCategory'][param['Category_ID']],
            name=param['Name'],
            description=param['Description'],
            jsondata={'example_source': param['Example_Source']})


    for de in args.cldf['CodeTable']:
        data.add(
            common.DomainElement, de['ID'],
            id=de['ID'],
            parameter=data['Feature'][de['Parameter_ID']],
            name=de['Name'],
            description=de['Description'],
            jsondata={'color': CODE_COLORS[de['Name']]},
            number=de['Number'])

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for example in args.cldf['ExampleTable']:
        s = data.add(
            common.Sentence, example['ID'],
            id=example['ID'],
            name=example['Primary_Text'],
            gloss='\t'.join(example['Gloss']) if example['Gloss'] else None,
            comment=example['Comment'] or None,
            description=example['Translated_Text'] or None,
            language=data['Variety'][example['Language_ID']])

        for ref in example['Source']:
            sid, pages = Sources.parse(ref)
            DBSession.add(common.SentenceReference(
                sentence=s, source=data['Source'][sid], description=pages, key=sid))

    for value in args.cldf['ValueTable']:
        de = data['DomainElement'][value['Code_ID']]
        vs = data.add(
            common.ValueSet, value['ID'],
            id=value['ID'],
            contribution=data['WaveContribution'][value['Language_ID']],
            parameter=data['Feature'][value['Parameter_ID']],
            jsondata=de.jsondata,
            language=data['Variety'][value['Language_ID']])
        v = data.add(
            common.Value, value['ID'],
            id=value['ID'],
            domainelement=de,
            valueset=vs)

        for eid in value['Example_ID']:
            DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))
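
The Sources.parse call in the example loop splits a CLDF reference of the form "key[pages]" into its two parts before the source lookup; roughly (the sample string is made up for illustration):

# Hedged sketch of the reference splitting used above; the input string
# is a made-up instance of the CLDF "key[pages]" convention.
sid, pages = Sources.parse('kortmann2013[12-14]')
# sid == 'kortmann2013', pages == '12-14'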