def main(args):
    user = getpass.getuser()
    data = Data()
    datadir = 'C:\\Python27\\glottobank\\Grambank\\' if user != 'robert' \
        else '/home/robert/venvs/glottobank/Grambank'

    dataset = common.Dataset(
        id=grambank.__name__,
        name="GramBank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    import_features_collaborative_sheet(datadir, data)
    import_cldf(os.path.join(datadir, 'datasets'), data)
    load_families(data, data['GrambankLanguage'].values(), isolates_icon='tcccccc')
def main(args):
    data = Data()
    dataset = common.Dataset(
        id=culturebank.__name__,
        name="CultureBank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='culturebank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            # Once licensed: 'Creative Commons Attribution 4.0 International License'
            'license_name': 'No license yet'})
    DBSession.add(dataset)

    import_features_collaborative_sheet(CULTUREBANK_REPOS, data)
    import_cldf(os.path.join(CULTUREBANK_REPOS, 'datasets'), data)
    load_families(
        data,
        list(data['CulturebankLanguage'].values()),
        isolates_icon='tcccccc')
def main(args):
    datadir = '/home/robert/venvs/glottobank/lexibank'

    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="LexiBank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

        for provider in [
            'transnewguinea',
            'abvd',
            'ids',
        ]:
            import_cldf(os.path.join(datadir, provider, 'cldf'), provider)

    with transaction.manager:
        load_families(Data(), DBSession.query(LexibankLanguage), isolates_icon='tcccccc')
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name':
                    'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

        glottolog_repos = Path(
            lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'glottolog3', 'glottolog')
        languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
        concepticon = Concepticon(
            Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'concepticon', 'concepticon-data'))
        conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

        for dname in sorted(repos.joinpath('datasets').iterdir(),
                            key=lambda p: p.name):
            if dname.is_dir() and dname.name != '_template':
                mdpath = dname.joinpath('cldf', 'metadata.json')
                if mdpath.exists():
                    print(dname.name)
                    import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')
def main(args):
    # TODO: explain etc diachronic_strength
    #       sigtests of dependencies
    #       isogloss-maps
    data = Data()

    dataset = common.Dataset(
        id=grambank.__name__,
        name="Grambank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    glottolog = Glottolog(GLOTTOLOG_REPOS)
    languoids = {l.id: l for l in glottolog.languoids()}

    import_gb20_features(GRAMBANK_REPOS, data)
    import_cldf(os.path.join(GRAMBANK_REPOS, 'datasets'), data, languoids)
    load_families(
        data,
        data['GrambankLanguage'].values(),
        glottolog=languoids,
        isolates_icon='tcccccc')

    # Add isolates as their own top-level families.
    for lg in data['GrambankLanguage'].values():
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family, gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexibank/lexibank-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="lexibank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

        glottolog = Glottolog(
            Path(lexibank.__file__).parent.parent.parent.parent.joinpath(
                'glottolog3', 'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
        concepticon = Concepticon(
            Path(lexibank.__file__).parent.parent.parent.parent.joinpath(
                'concepticon', 'concepticon-data'))
        conceptsets = {c['ID']: c for c in concepticon.conceptsets()}

        for dname in repos.joinpath('datasets').iterdir():
            if dname.is_dir() and dname.name != '_template':
                mdpath = dname.joinpath('metadata.json')
                if mdpath.exists():
                    print(dname.name)
                    import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexibankLanguage),
            glottolog=languoids,
            isolates_icon='tcccccc')
def test_load_families(self):
    from clld_glottologfamily_plugin.util import load_families

    class Languoid(object):
        id = 'abcd1234'
        iso_code = 'abc'
        name = 'language'
        latitude = 1.0
        longitude = 1.0
        macroareas = ['Area']

        @property
        def family(self):
            return self

    class TopLevelFamily(object):
        id = 'abcd1234'
        iso_code = 'abc'
        name = 'family'
        latitude = 1.0
        longitude = 1.0
        macroareas = ['Area']
        level = 'family'

        @property
        def family(self):
            return None

    class Glottolog(object):
        def languoid(self, code):
            if code == 'abc':
                return TopLevelFamily()
            return Languoid()

    load_families(Data(), DBSession.query(LanguageWithFamily), glottolog=Glottolog())
    load_families(
        Data(),
        [('abc', l) for l in DBSession.query(LanguageWithFamily)],
        glottolog=Glottolog())
def test_load_families(self):
    from clld_glottologfamily_plugin.util import load_families

    class Languoid(object):
        id = 'abcd1234'
        iso_code = 'abc'
        name = 'language'
        latitude = 1.0
        longitude = 1.0
        macroareas = ['Area']

        @property
        def family(self):
            return self

    class Glottolog(object):
        def languoid(self, code):
            return Languoid()

    load_families(Data(), DBSession.query(LanguageWithFamily), glottolog=Glottolog())
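The two tests above exercise the two call shapes for `load_families()` that recur in all of the loaders in this collection: a plain iterable of language objects, or an iterable of (code, language) pairs. A minimal sketch of both shapes follows; `sketch_load_families`, the `glottocode` attribute, and the comments on `strict` are illustrative assumptions, not code from any of the projects above, and a real loader would use one shape or the other, not both.

def sketch_load_families(data, languages, glottolog_repos):
    from clld_glottologfamily_plugin.util import load_families

    # Shape 1: pass language objects directly; each language's own id is
    # looked up in Glottolog (assumes the id is a glottocode).
    load_families(
        data,
        languages,
        glottolog_repos=glottolog_repos,
        isolates_icon='tcccccc')  # grey icon for isolates, as in the loaders above

    # Shape 2: pass (code, language) pairs when the clld ids are not
    # glottocodes and the mapping is kept separately. strict=False appears
    # in several loaders above, presumably to tolerate unresolvable codes.
    load_families(
        data,
        [(l.glottocode, l) for l in languages],
        glottolog_repos=glottolog_repos,
        strict=False)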
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')
    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') \
            or iso_codes.get(l.lg_id)
        lang = data.add(
            models.IdsLanguage, l.lg_id,
            id=l.lg_id, name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(),
        [(iso2glotto.get(c, c), l) for c, l in languages],
        glottolog=Glottolog(GLOTTOLOG_REPOS),
        isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        # columns: name lg_id what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i,
                type='name',
                name=l.name,
                description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier, language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))
            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    # known duplicate ids:
                    # 12-420-811-3, 5-390-818-3, 2-930-819-3 (twice),
                    # 3-120-819-3, 10-140-822-3, 9-160-825-3, 2-430-829-4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriptions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
def main(args):
    Index('ducet', collkey(func.translate(common.Value.name, 'ˈ,ː,ˌ', '')))\
        .create(DBSession.bind)
    data = Data()

    dataset = common.Dataset(
        id=numerals.__name__,
        name="Numeralbank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain="numerals.clld.org",
        jsondata={
            "license_icon": "cc-by.png",
            "license_name": "Creative Commons Attribution 4.0 International License",
        },
    )
    DBSession.add(dataset)

    for i, (id_, name) in enumerate([
        ("verkerkannemarie", "Annemarie Verkerk"),
        ("rzymskichristoph", "Christoph Rzymski"),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    # Take metadata from the curated CLDF dataset.
    ds = Wordlist.from_metadata(
        data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Parameters:
    for parameter in ds["ParameterTable"]:
        data.add(
            models.NumberParameter,
            parameter["ID"],
            id=parameter["ID"],
            name="{0}".format(parameter["ID"]),
            concepticon_id=parameter['Concepticon_ID'],
        )

    basis_parameter = data.add(
        models.NumberParameter,
        "0",
        id="0",
        name="Base",
    )

    load_family_langs = []
    for language in ds["LanguageTable"]:
        lang = data.add(
            models.Variety,
            language["ID"],
            id=language["ID"],
            name=language["Name"],
            latitude=language["Latitude"],
            longitude=language["Longitude"],
            creator=language["Contributor"],
            comment=language["Comment"],
            url_soure_name=language["SourceFile"],
        )
        if language["Glottocode"]:
            load_family_langs.append((language["Glottocode"], lang))

    # Get original forms.
    ds = Wordlist.from_metadata(
        data_repos[0]['data_path'] / 'cldf' / 'cldf-metadata.json')
    org_forms = {f["ID"]: f for f in ds["FormTable"]}

    d = data_repos[1]
    contrib = data.add(
        common.Contribution,
        d['id'],
        id=d['id'],
        name=d['name'],
    )

    # Process curated forms.
    ds = Wordlist.from_metadata(
        data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Add base info if given.
    for language in ds["LanguageTable"]:
        if language["Base"]:
            basis = language["Base"]
            de = data["DomainElement"].get(basis)
            if not de:
                de = data.add(
                    common.DomainElement,
                    basis,
                    id=text_type(basis),
                    name=text_type(basis),
                    parameter=basis_parameter,
                )
            vs = data.add(
                common.ValueSet,
                data["Variety"][language["ID"]].id,
                id=data["Variety"][language["ID"]].id,
                language=data["Variety"][language["ID"]],
                parameter=basis_parameter,
                contribution=contrib,
            )
            common.Value(
                id=data["Variety"][language["ID"]].id,
                valueset=vs,
                domainelement=de,
            )

    # Forms:
    for form in ds["FormTable"]:
        valueset_id = "{0}-{1}".format(form["Parameter_ID"], form["Language_ID"])
        valueset = data["ValueSet"].get(valueset_id)

        # Unless we already have something in the ValueSet, create it.
        # (Reuse the cached ValueSet otherwise, rather than a stale
        # reference from a previous iteration.)
        if not valueset:
            if form["Language_ID"] in data["Variety"]:
                valueset = data.add(
                    common.ValueSet,
                    valueset_id,
                    id=valueset_id,
                    language=data["Variety"][form["Language_ID"]],
                    parameter=data["NumberParameter"][form["Parameter_ID"]],
                    contribution=contrib,
                )

        org_form = ""
        if form["ID"] in org_forms:
            if unicodedata.normalize(
                    'NFC', org_forms[form["ID"]]["Form"].strip()) != form["Form"]:
                org_form = org_forms[form["ID"]]["Form"]
        else:
            org_form = "no original form"

        DBSession.add(
            models.NumberLexeme(
                id=form["ID"],
                name=form["Form"],
                comment=form["Comment"],
                is_loan=form["Loan"],
                other_form=form["Other_Form"],
                org_form=org_form,
                is_problematic=form["Problematic"],
                valueset=valueset,
            )
        )

    load_families(
        Data(),
        load_family_langs,
        glottolog_repos=gl_repos,
        strict=False,
    )

    distinct_varieties = DBSession.query(models.Variety.family_pk).distinct().all()
    families = dict(
        zip([r[0] for r in distinct_varieties],
            color.qualitative_colors(len(distinct_varieties))))

    for l in DBSession.query(models.Variety):
        l.jsondata = {"color": families[l.family_pk]}

    p = common.Parameter.get("0")
    colors = color.qualitative_colors(len(p.domain))

    for i, de in enumerate(p.domain):
        de.jsondata = {"color": colors[i]}
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Unit.name)).create(DBSession.bind)
    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('stiebelsbarbara', 'Barbara Stiebels'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')
    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()
    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            print('no md', submission.id)
            continue

        if not md['date_published']:
            print('no date', submission.id)
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            print('not selected', submission.id)
            continue

        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = [
            'lang-' + f if f in props['metalanguage_styles'] else f
            for f in props['custom_fields']]
        props.setdefault('choices', {})

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'

        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            doi=md.get('doi'),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=spec.get('primary', True),
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_sources(Dictionary.get(did), dictdata)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog/glottolog')
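The dictionaria loader above wraps each submission in its own transaction: it commits the shared metadata first, then runs begin/commit around every dictionary load, and once more around `load_families`. The IDS loaders elsewhere in this collection do the same per language, with a comment noting it keeps the memory footprint reasonable. The pattern in isolation, as a minimal sketch (`load_in_chunks` and `load_one_chunk` are placeholders, not part of dictionaria):

import transaction

def load_in_chunks(chunks, load_one_chunk):
    # One transaction per chunk: once a chunk is committed, its ORM objects
    # can be garbage-collected, so the session never holds the whole
    # dataset in memory at once.
    for chunk in chunks:
        transaction.begin()
        load_one_chunk(chunk)
        transaction.commit()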
def main(args): """ The case is we have to codings for two different dialects (called hua and yagaria) of the same iso "qgr", both of which we want to keep and keep separately. I had missed that when making NTS, rigging everything so that the iso would be the id, which is not sufficient. Glottocodes in Grambank would have taken care of it except the dialect division for yaga1260 is wrong, having yagaria as overarching and Hua under it (reality has it that Hua and Yagaria are two dialects of the same language, which has no name). So a solution with glottocodes would have to wait until we fix that or need another fix later. So I guess, for now, let's ignore qgr (and its datapoints) and I'll fix on my end later. """ data = Data( created=utc.localize(datetime(2013, 11, 15)), updated=utc.localize(datetime(2013, 12, 12))) icons = issues.Icons() dtab = partial(_dtab, args.data_file()) #Languages tabfns = ['%s' % fn.name for fn in args.data_file().glob('nts_*.tab')] args.log.info("Sheets found: %s" % tabfns) ldps = [] lgs = {} nfeatures = Counter() nlgs = Counter() for fn in tabfns: for ld in dtab(fn): if ld['language_id'] == 'qgr': continue if "feature_alphanumid" not in ld: args.log.info("NO FEATUREID %s %s" % (len(ld), ld)) if not ld["feature_alphanumid"].startswith("DRS") \ and ld["feature_alphanumid"].find(".") == -1: ldps.append(dp_dict(ld)) lgs[ld['language_id']] = unescape(ld['language_name']) if ld["value"] != "?": nfeatures.update([ld['language_id']]) nlgs.update([ld['feature_alphanumid']]) ldps = sorted(ldps, key=lambda d: d['feature_alphanumid']) lgs["ygr"] = "Hua" for lgid, lgname in lgs.items(): data.add( models.ntsLanguage, lgid, id=lgid, name=lgname, representation=nfeatures.get(lgid, 0)) DBSession.flush() load_families(data, [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['ntsLanguage'].values()], isolates_icon='tcccccc') #glottolog = Glottolog() #for lg in data['ntsLanguage'].values(): # print lg.id, NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id) # gl_language = glottolog.languoid(NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id)) # if not gl_language.family: # family = data.add(Family, gl_language.id, id = gl_language.id, name = gl_language.name, description=common.Identifier(name=gl_language.id, type=common.IdentifierType.glottolog.value).url(), jsondata={"icon": 'tcccccc'}) # lg.family = family #Domains for domain in set(ld['feature_domain'] for ld in ldps): data.add(models.FeatureDomain, domain, name=domain) DBSession.flush() #Designers for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")): designer_id = str(i + 1) data.add( models.Designer, info['designer'], id=designer_id, name=designer_id, domain=info["domain"], contributor=info['designer'], pdflink=info["pdflink"], citation=info["citation"]) DBSession.flush() #Sources for k, (typ, bibdata) in [ ktfbib(bibsource) for ld in ldps if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,") ]: if k not in data["Source"]: data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata))) DBSession.flush() #Features fs = [(fid, mergeds(lds)) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdesc = [(fid, [(ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds if ld.get("feature_possible_values")]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc] fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1] for (fid, vdescs) in fvmis: print fid, "DIFF VDESC" for (vd, fromf) 
in vdescs: print vd, set(fromf) for _, dfsids in groupby( sorted((f.get('feature_name', fid), fid) for fid, f in fs), key=lambda t: t[0]): assert len(list(dfsids)) == 1 for fid, f in fs: if not fid.isdigit(): args.log.info("NO INT FID %s" % f) feature = data.add( models.Feature, fid, id=fid, name=f.get('feature_name', f['feature_alphanumid']), doc=f.get('feature_information', ""), vdoc=f.get('feature_possible_values', ""), representation=nlgs.get(fid, 0), designer=data["Designer"][f['designer']], dependson=f.get("depends_on", ""), abbreviation=f.get("abbreviation", ""), featuredomain=data['FeatureDomain'][f["feature_domain"]], name_french=f.get('francais', ""), clarification=f.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""), alternative_id=f.get("old feature number", ""), jl_relevant_unit=f.get("relevant unit(s)", ""), jl_function=f.get("function", ""), jl_formal_means=f.get("formal means", ""), sortkey_str="", sortkey_int=int(fid)) vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")] vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist} vdesc.setdefault('?', 'Not known') if 'N/A' not in vdesc and feature.dependson: vdesc["N/A"] = "Not Applicable" vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))} vicons = icons.iconize(vi.keys()) for v, desc in vdesc.items(): data.add( common.DomainElement, (fid, v), id='%s-%s' % (fid, v), name=v, description=desc, jsondata={"icon": vicons[v]}, number=vi[v], parameter=feature) DBSession.flush() for ((f, lg), ixs) in grp2( [((ld['feature_alphanumid'], ld['language_id']), i) for i, ld in enumerate(ldps)]): ixvs = set([ldps[ix]['value'] for ix in ixs]) if len(ixvs) == 1: continue args.log.warn( "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs])) print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs]) errors = {} done = set() for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['ntsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if id_ in done: continue if (ld['feature_alphanumid'], ld['value']) not in data['DomainElement']: if not ld["value"].strip(): continue info = ( ld['feature_alphanumid'], ld.get('feature_name', "[Feature Name Lacking]"), ld['language_id'], ld['value'], ld['fromfile']) msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info args.log.error(msg.format(sorted( [y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid']]))) print msg.format(sorted( [y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid']])) errors[(ld['feature_alphanumid'], ld['language_id'])] = info continue vs = common.ValueSet( id=id_, language=language, parameter=parameter, source=ld["source"] or None, contribution=parameter.designer) models.ntsValue( id=id_, domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])], jsondata={"icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata}, comment=ld["comment"], valueset=vs, contributed_datapoint=ld["contributor"]) done.add(id_) if not ld.get('bibsources'): if 'bibsources' not in ld: args.log.warn("no bibsource %s" % ld) continue for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]: common.ValueSetReference(valueset=vs, source=data['Source'][k]) DBSession.flush() #To CLDF cldf = {} for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = 
data['ntsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if not id_ in done: continue dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"]) #, ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,")) cldf[dt] = None tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows]) savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "nts.cldf", encoding = "utf-8") #utf-16 "Comment", "Source", "Bibliographical Details" #cldf = {} #for ld in ldps: # parameter = data['Feature'][ld['feature_alphanumid']] # language = data['ntsLanguage'][ld['language_id']] # id_ = '%s-%s' % (parameter.id, language.id) # if not id_ in done: # continue # dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"], ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,")), ld.get("feature_information", ""), ld.get('feature_possible_values', ""), ld["designer"], ld.get("abbreviation", ""), ld["feature_domain"], ld.get('francais', ""), ld.get("dependencies", ""), ld.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", "")) # cldf[dt] = None #savu(tab([("Language", "iso-639-3", "Feature", "Value", "Comment", "Source", "Bibliographical Details", "Feature Information", "Feature Possible Values", "Feature Designer", "Feature Abbreviation", "Feature Domain", "Feature (French)", "Feature Dependencies", "Feature Clarifying Comments")] + cldf.keys()), "nts-with-metadata.tsv", encoding="utf-16") args.log.info('%s Errors' % len(errors)) dataset = common.Dataset( id="NTS", name='Nijmegen Typological Survey', publisher_name="Max Planck Institute for Psycholinguistics", publisher_place="Nijmegen", publisher_url="http://www.mpi.nl", description="""Dataset on Typological Features, collected 2013-2014 in the Language and Cognition Department at the Max Planck Institute for Psycholinguistics, Max-Planck Gesellschaft, and a European Research Council's Advanced Grant (269484 "INTERACT") to Stephen C. Levinson.""", domain='http://nts.clld.org', published=date(2014, 2, 20), contact='*****@*****.**', license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en', jsondata={ 'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png', 'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'}) for i, contributor in enumerate([ common.Contributor( id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**"), common.Contributor( id="Suzanne van der Meer", name="Suzanne van der Meer", email="*****@*****.**"), common.Contributor( id="Hedvig Skirgard", name="Hedvig Skirgard", email="*****@*****.**") ]): common.Editor(dataset=dataset, contributor=contributor, ord=i) DBSession.add(dataset)
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    def read(table):
        return list(dsv.reader(
            args.data_file(table + '.csv'), delimiter=',', namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        #published=date(2009, 8, 15),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-nc-nd/2.0/de/88x31.png',
            'license_name':
                'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany License',
        },
        domain='ids.clld.org')
    DBSession.add(dataset)

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=l.lg_name)
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    languages = {l.lg_id: iso_codes[l.sil_id]
                 for l in read('x_lg_sil') if l.lg_id not in exclude}
    load_families(Data(), [(v, data['IdsLanguage'][k]) for k, v in languages.items()])

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        # columns: name lg_id what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            if int(l.what_did_id) not in [4, 395]:
                print(l.what_did_id)
                raise ValueError
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="Max Planck Institute for Evolutionary Anthropology, Leipzig")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    for i, name in enumerate(sorted(sources.keys())):
        c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for lg in lgs:
            if lg in exclude:
                continue
            try:
                DBSession.add(common.LanguageSource(
                    language_pk=data['IdsLanguage'][lg].pk,
                    source_pk=data['Source'][name].pk))
            except KeyError:
                print(name, lgs)
                continue

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i,
                type='name',
                name=l.name,
                description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {'id': id_, 'name': name, 'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        try:
            language = common.Language.get(data['IdsLanguage'][lg_id])
        except KeyError:
            print(list(entries))
            raise
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                data.add(
                    models.Entry, entry_id,
                    id=entry_id,
                    name=entry_id,
                    sub_code=l.entry_id,
                    chapter_pk=data['Chapter'][l.chap_id])
                DBSession.flush()
                data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))
            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                    # Rapa Nui (language 238) has known misalignment problems.
                    trans2 = None

            for i, word in enumerate(trans1):
                v = models.Counterpart(
                    id=id_ + '-' + str(i + 1 + len(vs.values)),
                    name=word,
                    description=desc.get('1'),
                    valueset=vs)
                words[word].append((v, trans2[i] if trans2 else None))

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                print('---', language.id, language.name)
                print(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)
def main(args):
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon': '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name': license.name
        },
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for i, ed in enumerate(['gray']):
        data.add(common.Editor, ed, dataset=ds,
                 contributor=data['Contributor'][ed], ord=i)

    for lang in args.cldf.iter_rows(
            'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            description=lang['LongName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows(
            'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows(
            'FormTable', 'id', 'form', 'languageReference',
            'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def main(args):  # pragma: no cover
    #
    # FIXME: more generic:
    # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld!
    # - Store datasets in defaultdict(list) keyed with module
    #
    datasets = {}
    for ds in iter_datasets(args.cldf.directory):
        datasets[ds.module] = ds

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    thedataset = data.add(
        common.Dataset,
        hindukush.__name__,
        id=hindukush.__name__,
        name='Hindu Kush Areal Typology',
        domain='hindukush.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'
        },
    )
    for i, name in enumerate(
            ['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']):
        common.Editor(
            dataset=thedataset,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    for rec in bibtex.Database.from_file(
            pathlib.Path(__file__).parent / 'HK_website.bib', lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for module, ds in sorted(datasets.items(), key=lambda i: i[0]):
        for lang in ds.iter_rows(
                'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
            if lang['id'] not in data['Variety']:
                data.add(
                    models.Variety,
                    lang['id'],
                    id=lang['id'],
                    name=lang['name'],
                    latitude=lang['latitude'],
                    longitude=lang['longitude'],
                    glottocode=lang['glottocode'],
                    subgroup=lang['SubGroup'],
                    location=lang['Location'],
                    elicitation=lang['Elicitation'],
                    jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])),
                )

        contrib = data.add(
            models.CLDFDataset,
            module,
            id=module,
            name='{} [{}]'.format(ds.properties.get('dc:title'), module),
            description=ds.properties.get('dc:bibliographicCitation'),
            module=module,
        )

        if module == 'Wordlist':
            for param in ds.iter_rows(
                    'ParameterTable', 'id', 'concepticonReference', 'name'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name='{} [{}]'.format(param['name'], param['id']),
                    sortkey=param['id']
                    if not param['id'].startswith('Numerals')
                    else 'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])),
                    concepticon_id=param['concepticonReference'],
                    contribution=contrib,
                    category=param['domain'] or 'ASJPlist',
                )
            audio = {
                r['ID']: r
                for r in ds.iter_rows('media.csv') if r['mimetype'] == 'audio/mpeg'
            }
            for form in ds.iter_rows(
                    'FormTable', 'id', 'form', 'languageReference',
                    'parameterReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                mp3 = next(
                    iter([audio[aid] for aid in form['Audio_Files'] if aid in audio]),
                    None)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['form'],
                    valueset=vs,
                    jsondata=dict(
                        audio=ds.get_row_url('media.csv', mp3) if mp3 else None),
                )
        elif module == 'StructureDataset':
            for param in ds.iter_rows('ParameterTable', 'id', 'name', 'description'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name=param['name'],
                    description=html(param['description'])
                    if param['description'] else None,
                    category=param['Category'],
                    contribution=contrib,
                )
            for code in ds.iter_rows(
                    'CodeTable', 'id', 'name', 'description', 'parameterReference'):
                data.add(
                    common.DomainElement,
                    code['id'],
                    id=code['id'],
                    name=code['name'],
                    description=code['description'],
                    parameter=data['Param'][code['parameterReference']],
                    jsondata={
                        'color': {
                            'absent': 'ff0000',
                            'present': '0000ff',
                            'indeterminate': 'cccccc',
                        }.get(code['description'])
                    })

            for form in ds.iter_rows(
                    'ValueTable', 'id', 'value', 'languageReference',
                    'parameterReference', 'codeReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['value'],
                    valueset=vs,
                    domainelement=data['DomainElement'][form['codeReference']])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('moselulrike', 'Ulrike Mosel'),
        ('stiebelsbarbara', 'Barbara Stiebels'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')
    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()
    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            continue

        if not md['date_published']:
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            continue

        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = [
            'lang-' + f if f in props['metalanguage_styles'] else f
            for f in props['custom_fields']]

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'

        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=True,
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog3/glottolog')
def main(args):
    internal = input(
        '[i]nternal or [e]xternal data (default: e): ').strip().lower() == 'i'
    which_submission = input(
        "submission id or 'all' for all submissions (default: all): "
    ).strip().lower() or 'all'

    data = Data()

    dataset = common.Dataset(
        id=crossgram.__name__,
        name='Crossgram',
        description='Crossgram',
        published=date(2019, 12, 12),
        domain='crossgram.clld.org',
        # XXX Is any of this correct?
        publisher_name='Max Planck Institute for the Science of Human History',
        publisher_place='Jena',
        publisher_url='https://ssh.mpg.de',
        license='http://creativecommons.org/licenses/by/4.0',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'
        })

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    internal_repo = pathlib.Path('../../crossgram/crossgram-internal')
    cache_dir = internal_repo / 'datasets'
    cache_dir.mkdir(exist_ok=True)

    if internal:
        submissions_path = internal_repo / 'submissions-internal'
    else:
        submissions_path = internal_repo / 'submissions'

    language_id_map = {}
    for contrib_dir in submissions_path.iterdir():
        if not contrib_dir.is_dir():
            continue
        if which_submission != 'all' and which_submission != contrib_dir.name:
            continue
        sid = contrib_dir.name
        print('Loading submission', sid, '...')

        contrib_md = jsonlib.load(contrib_dir / 'md.json')

        intro = None
        try:
            with (contrib_dir / 'intro.md').open(encoding='utf-8') as f:
                intro = f.read()
        except IOError:
            # If there is no intro, there is no intro *shrug*
            pass

        path = download_data(sid, contrib_md, cache_dir)
        if not path.exists():
            print('could not find folder', str(path))
            continue
        submission = CLDFBenchSubmission.load(path, contrib_md)

        date_match = re.fullmatch(r'(\d+)-(\d+)-(\d+)', contrib_md['published'])
        assert date_match
        yyyy, mm, dd = date_match.groups()
        published = date(int(yyyy), int(mm), int(dd))

        # Strip the ssh prefix off the git link.
        git_https = re.sub(
            r'^git@([^:]*):', r'https://\1/', contrib_md.get('repo') or '')

        contrib = data.add(
            models.CrossgramData,
            sid,
            id=sid,
            number=int(contrib_md['number']),
            published=published,
            name=submission.title,
            doi=contrib_md.get('doi'),
            git_repo=git_https,
            description=intro or submission.readme)

        submission.add_to_database(data, language_id_map, contrib)
        print('... done')

    DBSession.flush()

    print('Loading language family data...')
    catconf = cldfcatalog.Config.from_file()
    glottolog_path = catconf.get_clone('glottolog')
    load_families(
        Data(),
        [v for v in DBSession.query(models.Variety)
         if re.fullmatch('[a-z]{4}[0-9]{4}', v.id)],
        strict=False,
        glottolog_repos=glottolog_path)
    print('... done')
def main(args):
    data = Data()
    cldf_data = args.cldf

    data.add(
        common.Contributor, 'fehnannemarie',
        id='fehnannemarie',
        name="Anne-Marie Fehn",
        url="https://shh.mpg.de")
    # TODO: Editors/Contributors

    dataset = common.Dataset(
        id=kba.__name__,
        name="KBA",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='kba.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for i, editor in enumerate(['fehnannemarie']):
        common.Editor(
            dataset=dataset,
            contributor=data['Contributor'][editor],
            ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for language in cldf_data['LanguageTable']:
        lang = data.add(
            models.KbaLanguage, language['ID'],
            id=language['ID'],
            name=language['Name'])
        add_language_codes(data, lang, None, glottocode=language['Glottocode'])

    # TODO: Concepticon
    for parameter in cldf_data['ParameterTable']:
        data.add(
            common.Parameter, parameter['ID'],
            id=parameter['ID'],
            name='{0} ({1})'.format(parameter['Name'], parameter['ID']))

    for form in cldf_data['FormTable']:
        valueset_id = '{0}-{1}'.format(form['Parameter_ID'], form['Language_ID'])
        valueset = data['ValueSet'].get(valueset_id)

        # Unless we already have something in the VS:
        if not valueset:
            valueset = data.add(
                common.ValueSet, valueset_id,
                id=valueset_id,
                language=data['KbaLanguage'][form['Language_ID']],
                parameter=data['Parameter'][form['Parameter_ID']],
                contribution=contrib)

        DBSession.add(
            models.Word(
                id=form['ID'],
                name=form['Form'],
                comment=form.get('Comment'),
                sourceorthography=form.get('sourceorthography'),
                kbaorthography=form.get('kbaorthography'),
                wordclass=form.get('wordclass'),
                grammaticalnotes=form.get('grammaticalnotes'),
                idiolectalvariant=form.get('idiolectalvariant'),
                originaltranslation=form.get('originaltranslation'),
                valueset=valueset))

    load_families(
        data,
        [(l.glottocode, l) for l in data['KbaLanguage'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc')
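# Sketch (illustrative, not part of the original): all of these loaders
# funnel objects through clld's `Data` cache so later rows can look objects
# up by a surrogate key before they have database primary keys. A simplified
# stand-in showing the idea (clld's real Data also calls DBSession.add):
from collections import defaultdict

class MiniData(defaultdict):
    def __init__(self):
        super().__init__(dict)

    def add(self, model, key, **kw):
        obj = model(**kw)                 # instantiate the model
        self[model.__name__][key] = obj   # register under its surrogate key
        return obj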
def _main(data, glottolog):
    languoids = list(glottolog.languoids())
    lbyi = {l.iso: l for l in languoids if l.iso}

    dataset = common.Dataset(
        id='ldh',
        name='Language Description Heritage',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='ldh.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    DBSession.add(common.Editor(
        dataset=dataset,
        contributor=common.Contributor(id='forkel', name='Robert Forkel')))

    ls = set()
    for post in iter_posts():
        if post.pure_item_id:
            item = pure.Item.from_json(post.pure_item_id)
            src = data['Description'].get(item.id)
            if not src:
                src = data.add(
                    models.Description,
                    item.id,
                    id=item.id,
                    description=item.title,
                    name=item.name,
                    bibtex_type=EntryType.get(item.bibtex_type),
                    year=item.year,
                    title=item.title,
                    address=item.publisher.get('place') if item.publisher else None,
                    publisher=item.publisher.get('publisher') if item.publisher else None,
                    author=' and '.join(item.authors),
                    editor=' and '.join(item.editors),
                    pid=item.doi or item.pid,
                    pid_type='doi' if item.doi else 'hdl',
                )
                DBSession.flush()
                for file in item.files:
                    if file.visibility == 'PUBLIC' \
                            and file.metadata["contentCategory"] == "any-fulltext" \
                            and file.storage == 'INTERNAL_MANAGED':
                        assert file.mimeType == 'application/pdf'
                        DBSession.add(common.Source_files(
                            id=file.pid.replace('/', '__'),
                            name=file.name,
                            object_pk=src.pk,
                            mime_type=file.mimeType,
                            jsondata=dict(
                                size=file.size,
                                license=attr.asdict(file.license) if file.license else None),
                        ))
            for iso in item.isocodes:
                if iso in lbyi:
                    gl = lbyi[iso]
                    l = data['LDHLanguage'].get(iso)
                    if not l:
                        l = data.add(models.LDHLanguage, iso, id=iso, name=gl.name)
                    DBSession.flush()
                    if (item.id, iso) not in ls:
                        DBSession.add(common.LanguageSource(
                            language_pk=l.pk, source_pk=src.pk))
                        ls.add((item.id, iso))

    for item in zenodo.iter_items():
        src = data.add(
            models.Description,
            item.id,
            id=item.id,
            description=item['metadata']['title'],
            name=item.name,
            bibtex_type=EntryType.get(item.bibtex_type),
            year=item.year,
            title=item['metadata']['title'],
            publisher='Zenodo',
            author=' and '.join(a['name'] for a in item['metadata']['creators']),
            pid=item['metadata']['doi'],
            pid_type='doi',
        )
        DBSession.flush()
        for file in item['files']:
            license = licenses.find(item['metadata']['license']['id'])
            DBSession.add(common.Source_files(
                id=file['checksum'].replace('md5:', ''),
                name=file['key'],
                object_pk=src.pk,
                mime_type='application/' + file['type'],
                jsondata=dict(
                    size=file['size'],
                    url=file['links']['self'],
                    license=attr.asdict(license) if license else None),
            ))
        for kw in item['metadata']['keywords']:
            if not kw.startswith('iso:'):
                continue
            iso = kw.replace('iso:', '')
            if iso in lbyi:
                gl = lbyi[iso]
                l = data['LDHLanguage'].get(iso)
                if not l:
                    l = data.add(models.LDHLanguage, iso, id=iso, name=gl.name)
                DBSession.flush()
                if (item.id, iso) not in ls:
                    DBSession.add(common.LanguageSource(
                        language_pk=l.pk, source_pk=src.pk))
                    ls.add((item.id, iso))

    load_families(
        data,
        data['LDHLanguage'].values(),
        glottolog_repos=glottolog.repos,
        isolates_icon='tcccccc')
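# Sketch (illustrative, not part of the original): the pid/pid_type pairs
# above encode "prefer a DOI, otherwise fall back to the handle". The helper
# name is made up.
def persistent_id(doi, pid):
    return (doi, 'doi') if doi else (pid, 'hdl')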
def main(args):  # pragma: no cover
    get_repos()
    api = Grambank(REPOS['Grambank'])
    cldf = args.cldf
    data = Data()

    dataset = models.Grambank(
        id=grambank.__name__,
        name="Grambank",
        description="Grambank",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    contributors = {}
    for i, contrib in enumerate(api.contributors):
        contrib = common.Contributor(id=contrib.id, name=contrib.name)
        common.Editor(dataset=dataset, contributor=contrib, ord=i)
        DBSession.add(contrib)
        DBSession.flush()
        contributors[contrib.id] = contrib.pk

    contributions = {r['ID']: r for r in cldf['LanguageTable']}

    DBSession.add(dataset)

    for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)), desc='sources'):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()
    sources = {k: v.pk for k, v in data['Source'].items()}

    features, codes = import_features(cldf, contributors)
    transaction.commit()

    values_by_sheet = [
        (lid, list(v)) for lid, v in itertools.groupby(
            sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']),
            lambda r: r['Language_ID'],
        )]
    for lid, values in tqdm(values_by_sheet, desc='loading values'):
        transaction.begin()
        import_values(values, contributions[lid], features, codes, contributors, sources)
        transaction.commit()

    transaction.begin()

    glottolog = Glottolog(REPOS['glottolog'])
    languoids = {l.id: l for l in glottolog.languoids()}
    gblangs = DBSession.query(models.GrambankLanguage).all()
    load_families(
        data, gblangs, glottolog_repos=REPOS['glottolog'], isolates_icon='dcccccc')

    # Add isolates
    for lg in gblangs:
        gl_language = languoids.get(lg.id)
        if gl_language and not gl_language.family:
            family = data.add(
                Family,
                gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    coverage.main(glottolog)
    return
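# Sketch (illustrative, not part of the original): itertools.groupby only
# merges *adjacent* keys, which is why the values above are sorted by
# Language_ID before grouping. A generic, runnable illustration:
import itertools

rows = [{'Language_ID': 'b'}, {'Language_ID': 'a'}, {'Language_ID': 'a'}]
by_lang = [
    (lid, list(group))
    for lid, group in itertools.groupby(
        sorted(rows, key=lambda r: r['Language_ID']),
        lambda r: r['Language_ID'])]
# by_lang == [('a', [{...}, {...}]), ('b', [{...}])]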
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')

    data = Data()
    ds = data.add(
        common.Dataset,
        lsi.__name__,
        id=lsi.__name__,
        name='The Comparative Vocabularies of the "Linguistic Survey of India" Online',
        domain='lsi.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            order=int(lang['Order']),
            number=lang['NumberInSource'],
            family_in_source=lang['FamilyInSource'],
        )
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            description=param['Concepticon_Gloss'],
            concepticon_id=param['concepticonReference'],
            pages=param['PageNumber'],
        )

    inventories = collections.defaultdict(set)
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']] |= set(form['Segments'])

        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],
            id=form['id'],
            name=form['form'],
            description=''.join(form['Segments']).replace('+', ' '),
            segments=' '.join(form['Segments']),
            valueset=vs,
        )

    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
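# Sketch (illustrative, not part of the original): the inventory code above
# resolves raw segments against CLTS BIPA; sounds that CLTS cannot interpret
# lack a `.name` and get filtered out. Assumes pyclts and a local clone of
# cldf-clts/clts (the path is illustrative):
from pyclts import CLTS

clts = CLTS('../../cldf-clts/clts')
sound = clts.bipa['kʰ']
print(str(sound), sound.name)  # e.g. "kʰ aspirated voiceless velar stop consonant"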
def main(args):  # pragma: no cover
    wl = Wordlist.from_metadata(args.data_file('cldf', 'cldf-metadata.json'))
    data = Data()

    data.add(
        common.Contributor, 'barthwolfgang',
        id='barthwolfgang',
        name="Wolfgang Barth",
        url="http://www.dynamicsoflanguage.edu.au/")

    #
    # FIXME: get dataset attributes from CLDF metadata!
    #
    dataset = common.Dataset(
        id='parabank',
        name='Parabank Pronouns',
        description='Database of pronouns',
        domain='parabank.clld.org',
        publisher_name="CoEDL Centre of Excellence for the Dynamics of Language",
        publisher_place="Canberra, Australia",
        publisher_url="http://www.dynamicsoflanguage.edu.au/",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0'})
    DBSession.add(dataset)

    for i, editor in enumerate(['barthwolfgang']):
        common.Editor(
            dataset=dataset,
            contributor=data['Contributor'][editor],
            ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for l in wl['LanguageTable']:
        lang = data.add(
            models.ParabankLanguage,
            l['ID'],
            id=l['ID'],
            name=l['Name'],
            description=l['Notes'],
            source=l['Source_Citation'],
            classification=l['Classification'],
        )
        add_language_codes(data, lang, None, glottocode=l['Glottocode'])

    for p in wl['ParameterTable']:
        data.add(
            common.Parameter,
            p['ID'],
            id=p['ID'],
            name='{0} ({1})'.format(p['Name'], p['ID']),
            #description=p['Description'],
        )

    for f in wl['FormTable']:
        vsid = '{0}-{1}'.format(f['Parameter_ID'], f['Language_ID'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id=vsid,
                language=data['ParabankLanguage'][f['Language_ID']],
                parameter=data['Parameter'][f['Parameter_ID']],
                contribution=contrib)

        DBSession.add(models.Word(
            id=f['ID'],
            name=f['Form'],
            comment=f.get('Comment'),
            original=f['Original_parameter'],
            valueset=vs))

    load_families(
        data,
        [(l.glottocode, l) for l in data['ParabankLanguage'].values()],
        glottolog_repos=args.data_file('glottolog'),
        isolates_icon='tcccccc')
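# Sketch (illustrative, not part of the original): re the FIXME above, the
# dataset attributes could be read from the CLDF metadata the way the lsi and
# polyglottaafricana loaders do, via pycldf's `properties` dict. The metadata
# path is illustrative.
from pycldf import Wordlist

wl = Wordlist.from_metadata('cldf/cldf-metadata.json')
title = wl.properties.get('dc:title')
citation = wl.properties.get('dc:bibliographicCitation')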
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2015, 10, 1),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    ed = data.add(
        common.Contributor, 'hartmanniren', id='hartmanniren', name='Iren Hartmann')
    common.Editor(dataset=dataset, contributor=ed)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}
    comparison_meanings_alt_labels = {}

    print('loading concepts ...')
    concepticon = Concepticon()
    for i, concept_set in enumerate(concepticon.resources('parameter').members):
        concept_set = concepticon.resource(concept_set)
        cm = ComparisonMeaning(
            id=concept_set.id,
            name=concept_set.name.lower(),
            description=concept_set.description,
            concepticon_url='%s' % concept_set.uriref)
        DBSession.add(cm)
        comparison_meanings[cm.name] = cm
        for label in concept_set.alt_labels:
            comparison_meanings_alt_labels.setdefault(label.lower(), cm)
    DBSession.flush()
    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    comparison_meanings_alt_labels = {
        k: v.pk for k, v in comparison_meanings_alt_labels.items()}

    submissions = []

    for submission in REPOS.joinpath('submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        id_ = submission.id
        lmd = md['language']

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            name=lmd['name'] + ' Dictionary',
            language=language,
            published=date(*map(int, md['published'].split('-'))))

        for i, cname in enumerate(md['authors']):
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(common.Contributor, cid, id=cid, name=cname)
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=True,
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        try:
            mod = __import__(
                'dictionaria.loader.' + submission.id, fromlist=['MARKER_MAP'])
            marker_map = mod.MARKER_MAP
        except ImportError:
            marker_map = {}

        transaction.begin()
        print('loading %s ...' % submission.id)
        submission.load(
            did, lid, comparison_meanings, comparison_meanings_alt_labels, marker_map)
        transaction.commit()
        print('... done')

    #('hoocak', 'Hooca\u0328k', 43.5, -88.5, [('hartmanniren', 'Iren Hartmann')]),
    #('yakkha', 'Yakkha', 27.37, 87.93, [('schackowdiana', 'Diana Schackow')]),
    #('palula', 'Palula', 35.51, 71.84, [('liljegrenhenrik', 'Henrik Liljegren')], {}),
    #('daakaka', 'Daakaka', -16.27, 168.01, [('vonprincekilu', 'Kilu von Prince')],
    # {'published': date(2015, 9, 30), 'iso': 'bpa', 'glottocode': 'daka1243'}),
    #('teop', 'Teop', -5.67, 154.97, [('moselulrike', 'Ulrike Mosel')],
    # {'published': date(2015, 9, 30), 'iso': 'tio', 'glottocode': 'teop1238', 'encoding': 'latin1'}),

    transaction.begin()
    load_families(Data(), DBSession.query(Variety))
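# Sketch (illustrative, not part of the original): contributor ids in these
# loaders are built as slug(last + first). A rough approximation of the
# convention, assuming the nameparser package (clld's real slug() also
# normalizes diacritics):
from nameparser import HumanName

def contributor_id(full_name):
    n = HumanName(full_name)
    return ('%s%s' % (n.last, n.first)).lower().replace(' ', '')

# contributor_id('Iren Hartmann') -> 'hartmanniren'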
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    data.add(
        common.Dataset,
        polyglottaafricana.__name__,
        id=polyglottaafricana.__name__,
        domain='',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
        )
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            common.Value,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
        )
    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
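# Sketch (illustrative, not part of the original): the refs dict above
# accumulates page ranges per (valueset, source) pair; Sources.parse splits
# CLDF "key[pages]" references, as used in the loaders (pycldf API):
from pycldf.sources import Sources

sid, pages = Sources.parse('meinhof1912[12-15]')
# sid == 'meinhof1912', pages == '12-15' (pages is None when absent)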
def main(args):  # pragma: no cover
    ds = StructureDataset.from_metadata(DS)
    data = Data()

    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))
    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={'readme': contrib['Readme'], 'contents': contrib['Contents']},
        )
        for src in contrib['Source']:
            DBSession.add(models.ContributorReference(
                source=data['Source'][src], contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, (cid, name) in enumerate([
        ('UZ', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
    ], start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(common.Editor(dataset=dataset, ord=i, contributor=contrib))

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(
        data,
        [(l.id, l) for l in data['Variety'].values() if len(l.id) == 8],
        glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)
    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(), key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    for segment in ds['ParameterTable']:
        equivalence_class = ''.join(
            [t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
             if t[1].split()[0] not in ['COMBINING', 'MODIFIER']])
        data.add(
            models.Segment,
            segment['ID'],
            id=segment['ID'],
            name=segment['Name'],
            description=segment['Description'],
            segment_class=segment['SegmentClass'],
            equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent / 'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid), languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(common.Parameter_data(
                    key=feature_name(k),
                    value=v,
                    ord=i,
                    object_pk=data['Segment'][segment['ID']].pk))

    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(common.ContributionContributor(
            contribution=inv,
            contributor=data['Contributor'][inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(common.ContributionReference(
                contribution=inv, source=data['Source'][src]))

    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])

        for ref in phoneme['Source']:
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref], valueset=vs))

        DBSession.add(models.Phoneme(
            id=phoneme['ID'],
            name='%s %s' % (
                phoneme['Value'],
                data['Inventory'][phoneme['Contribution_ID']].name),
            allophones=' '.join(phoneme['Allophones']),
            marginal=phoneme['Marginal'],
            valueset=vs))
    return
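# Sketch (illustrative, not part of the original): the "equivalence class"
# computed above strips combining and modifier characters so related segments
# collapse to the same base symbol. Stand-alone, runnable form of that
# comprehension, pure standard library:
import unicodedata

def equivalence_class(segment):
    return ''.join(
        c for c in segment
        if unicodedata.name(c).split()[0] not in ('COMBINING', 'MODIFIER'))

assert equivalence_class('kʰ') == 'k'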
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=chicago.__name__,
        name="chicago",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='chicago.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    common.Editor(
        dataset=dataset,
        contributor=common.Contributor(id='haspelmath', name='Martin Haspelmath'))
    DBSession.add(dataset)

    with UnicodeReader(args.data_file('features.csv')) as reader:
        features = {id_: name for id_, name in reader}

    with UnicodeDictReader(args.data_file('chicago.csv')) as reader:
        for row in reader:
            lang = data.add(
                models.ChicagoLanguage,
                row['Glotto-code'],
                id=row['Glotto-code'],
                name=row['language'])
            contributor = data['Contributor'].get(row['author'])
            if contributor is None:
                contributor = data.add(
                    common.Contributor,
                    row['author'],
                    id=slug(row['author'].split()[-1]),
                    name=row['author'])
            contrib = common.Contribution(
                id=lang.id, name='%s structure dataset' % lang.name)
            common.ContributionContributor(
                contribution=contrib, contributor=contributor)

            for k, v in row.items():
                if v and k in features:
                    param = data['Parameter'].get(k)
                    if not param:
                        param = data.add(
                            common.Parameter, k, id=k[1:], name=features[k])
                        for i, (c, icon) in enumerate([
                            ('a', 'cff0000'),
                            ('b', 'c0000ff'),
                            ('c', 'cffff00'),
                            ('d', 'c00ffff'),
                            ('e', 'cff6600'),
                            ('f', 'c990099'),
                            ('g', 'ccccccc'),
                        ]):
                            deid = '%s-%s' % (param.id, c)
                            data.add(
                                common.DomainElement,
                                deid,
                                id=deid,
                                name=c,
                                parameter=param,
                                number=i + 1,
                                jsondata=dict(icon=icon))
                    de = data['DomainElement']['%s-%s' % (param.id, v)]
                    vsid = '%s-%s' % (lang.id, param.id)
                    vs = common.ValueSet(
                        id=vsid, language=lang, parameter=param, contribution=contrib)
                    DBSession.add(common.Value(id=vsid, domainelement=de, valueset=vs))

    load_families(data, data['ChicagoLanguage'].values())
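# Usage note (inferred from the calls above, not from the plugin's docs):
# across these loaders, load_families from clld_glottologfamily_plugin is
# invoked in three shapes:
#   load_families(data, languages)                       # objects whose .id is a glottocode
#   load_families(data, [(glottocode, lang), ...], ...)  # explicit code/object pairs
#   load_families(Data(), DBSession.query(Model),
#                 glottolog_repos=path, isolates_icon='tcccccc', strict=False)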