def testapp():
    from webtest import TestApp
    from pyramid import config  # assumed: `config` was a module-level import in the original snippet
    from clld.db.meta import DBSession, VersionedDBSession, Base
    from clld.db.models import common
    from clld_cognacy_plugin.models import Cognateset, Cognate

    def main():
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': [
                'clld:web/templates',
                'clld_cognacy_plugin:templates']})
        cfg.include('clld.web.app')
        cfg.include('clld_cognacy_plugin')
        return cfg.make_wsgi_app()

    DBSession.remove()
    VersionedDBSession.remove()
    wsgi_app = main()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()
    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    cs = Cognateset(id='1', name='cs: test')
    lang = common.Language(id='l', latitude=2, longitude=2)
    param = common.Parameter(id='l')
    vs = common.ValueSet(id='vs', language=lang, parameter=param)
    v = common.Value(id='v', name='abc', valueset=vs)
    DBSession.add(Cognate(cognateset=cs, counterpart=v))
    yield TestApp(wsgi_app)
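# Usage sketch (not from the original source): assuming the `testapp` generator above is
# registered as a pytest fixture (e.g. decorated with @pytest.fixture at module level),
# a test can exercise the fully wired app through webtest. The requested path and the
# asserted body content are illustrative assumptions only.
def test_dataset_home(testapp):
    res = testapp.get('/', status=200)
    assert 'test app' in res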
def add_meta_data(session):
    """
    Creates and adds to the given SQLAlchemy session the common.Dataset and
    related model instances that comprise the project's meta info.

    Helper for the main function that keeps the meta data in one place for
    easier reference and editing.
    """
    dataset = common.Dataset(
        id='northeuralex',
        name='NorthEuraLex',
        description='Lexicostatistical Database of Northern Eurasia',
        publisher_name='Seminar für Sprachwissenschaft at the University of Tübingen',
        publisher_place='Tübingen',
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        jsondata={
            'license_icon': 'cc-by-sa.png',
            'license_name': 'Creative Commons Attribution-ShareAlike 4.0 International License'},
        contact='*****@*****.**',
        domain='northeuralex.org')
    session.add(dataset)

    dataset.editors.append(
        common.Editor(contributor=common.Contributor(id='jdellert', name='Johannes Dellert')))
    dataset.editors.append(
        common.Editor(contributor=common.Contributor(id='gjäger', name='Gerhard Jäger')))
def setUp(self):
    TestWithDb.setUp(self)

    DBSession.add(common.Dataset(id='d', name='test', domain='localhost'))
    family = Family(id='f', name='family', description='desc')
    DBSession.add(LanguageWithFamily(id='l1', family=family))
    DBSession.add(LanguageWithFamily(id='l2'))
    DBSession.flush()
def add_meta_data(session):
    """
    Creates and adds to the given SQLAlchemy session the common.Dataset and
    related model instances that comprise the project's meta info.

    Helper for the main function that keeps the meta data in one place for
    easier reference and editing.
    """
    dataset = common.Dataset(
        id='tuled',
        name='TuLeD',
        description='Tupían Lexical Database',
        publisher_name='Seminar für Sprachwissenschaft at the University of Tübingen',
        publisher_place='Tübingen',
        license='https://creativecommons.org/licenses/by-sa/4.0/',
        jsondata={
            'license_icon': 'cc-by-sa.png',
            'license_name': 'Creative Commons Attribution-ShareAlike 4.0 International License'},
        contact='*****@*****.**',
        domain='xyz.org')
    session.add(dataset)

    dataset.editors.append(
        common.Editor(contributor=common.Contributor(
            id='fgerardi', name='Fabrício Ferraz Gerardi')))
    dataset.editors.append(
        common.Editor(contributor=common.Contributor(
            id='sreichert', name='Stanislav Reichert')))
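# Usage sketch (not part of the original code): both add_meta_data helpers above expect the
# active SQLAlchemy session, typically clld's scoped DBSession, handed in by the project's
# main function. A minimal, assumed invocation:
from clld.db.meta import DBSession

def main(args):
    add_meta_data(DBSession)
    DBSession.flush()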
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=u'An Crúbadán',
        name=u'An Crúbadán',
        publisher_name="Saint Louis University",
        publisher_place="Saint Louis, USA",
        publisher_url="http://www.slu.edu/",
        description="Linguistic datasets for over 2000 languages created from web-crawled text corpora",
        contact="*****@*****.**",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'https://licensebuttons.net/l/by/4.0/88x31.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )

    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(
        common.Contributor, "Kevin Scannell",
        id="Kevin Scannell", name="Kevin Scannell", email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

        glottolog_repos = Path(
            lexirumah.__file__).parent.parent.parent.parent.joinpath('glottolog3', 'glottolog')
        languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
        concepticon = Concepticon(
            Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'concepticon', 'concepticon-data'))
        conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

        skip = True
        for dname in sorted(repos.joinpath('datasets').iterdir(), key=lambda p: p.name):
            #if dname.name == 'benuecongo':
            #    skip = False
            #if skip:
            #    continue
            if dname.is_dir() and dname.name != '_template':
                mdpath = dname.joinpath('cldf', 'metadata.json')
                if mdpath.exists():
                    print(dname.name)
                    import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')
def testapp():
    def main():
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': ['clldmpg:templates', 'clld:web/templates']})
        cfg.include('clldmpg')
        return cfg.make_wsgi_app()

    DBSession.remove()
    wsgi_app = main()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()
    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    yield ExtendedTestApp(wsgi_app)
def _addDataset(data):
    """For a lighter 'main' function."""
    dataset = common.Dataset(
        id=dorelld.__name__,
        name="DoReCo",
        domain='doreco.info',
        description=('DoReCo'),  # name for citation?
        published=date(2020, 9, 1),  # date
        contact='*****@*****.**',  # mail
        publisher_name='',
        publisher_place='',
        #license='http://creativecommons.org/licenses/by/4.0/',
        #jsondata={
        #    'license_icon': 'cc-by.png',
        #    'license_name': ('Creative Commons ' +
        #                     'Attribution 4.0 International License')
        #    }
    )

    return dataset
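# Usage sketch (assumption, not from the DoReCo source): _addDataset only constructs the
# Dataset instance, so the caller is expected to add it to the session itself; a minimal,
# assumed main function would look roughly like this.
def main(args):
    data = Data()
    dataset = _addDataset(data)
    DBSession.add(dataset)
    DBSession.flush()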
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=moslex.__name__,
        domain='moslex.clld.org',
        name='MosLex',
        license='https://creativecommons.org/licenses/by-nc/4.0/',
        jsondata={
            'license_icon': 'cc-by-nc.png',
            'license_name': 'Creative Commons Attribution-NC 4.0 International License'})

    editor = data.add(common.Contributor, 'editor', id='editor', name='Alexei Kassian')
    common.Editor(dataset=dataset, contributor=editor)

    with open('languoids.json') as file:
        languoids = json.load(file)
    with open('concepts.json') as file:
        concepts = json.load(file)
    with open('forms.json') as file:
        forms = json.load(file)

    add_languoids(data, languoids)
    add_concepts(data, concepts)
    add_forms(data, forms)

    DBSession.add(dataset)
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=moslex.__name__,
        domain='moslex.clld.org',
        name='MosLex',
        license='https://creativecommons.org/licenses/by-nc/4.0/',
        jsondata={
            'license_icon': 'cc-by-nc.png',
            'license_name': 'Creative Commons Attribution-NC 4.0 International License'})

    editor = data.add(
        common.Contributor, 'editor',
        id='editor', name='Alexei Kassian', email='*****@*****.**')
    common.Editor(dataset=dataset, contributor=editor)

    with open(os.environ['MOSLEX_CONCEPTS']) as file:
        concepts = json.load(file)
    add_concepts(data, concepts)

    data_folders = [
        path for path in glob.glob(os.path.join(os.environ['MOSLEX_DATA'], '*'))
        if os.path.isdir(path)
    ]
    for folder in data_folders:
        add_data_folder(folder, data)

    DBSession.add(dataset)
def main(args):
    data = Data()
    lotw_conn = sqlite3.connect("lotw_base.sqlite")
    lotw_base = lotw_conn.cursor()

    contrib = common.Contribution(id="initial_contrib", name="Initial contribution")
    dataset = common.Dataset(
        id=lotw_dev.__name__,
        domain='lotw_dev.clld.org',
        name="Languages of the World",
        publisher_name="IL RAS",
        publisher_place="Moscow",
        publisher_url="http://iling-ran.ru/main/",
        jsondata={
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    feature_dict = {}
    unnamed_feature_count = 0
    features = lotw_base.execute("SELECT * FROM Feature").fetchall()
    names = [y[2] for y in features]
    feat_name_counts = {
        x[2]: [names.count(x[2]), 0] for x in features if names.count(x[2]) > 1}
    # features = [convert_feature(x) for x in features]

    for feature in features:
        name = feature[2]
        # if name == ".О":
        #     continue
        if name in feat_name_counts.keys():
            temp_name = name
            name += ("_" + str(feat_name_counts[name][1]))
            feat_name_counts[temp_name][1] += 1
        feature_dict[feature[0]] = TreeFeature(
            pk=feature[0], id=feature[0], name=name, father_pk=feature[5])
        print("Added feature %s" % feature[2])

    langs = lotw_base.execute("SELECT * FROM Language").fetchall()
    assert len(set([lang[0] for lang in langs])) == len([lang[0] for lang in langs])

    for language in langs:
        value_sets = []
        geodata = lotw_base.execute(
            "SELECT * FROM Geographical_index WHERE Lang_id=?",
            (str(language[0]), )).fetchone()
        famdata = lotw_base.execute(
            "SELECT * FROM Genealogical_index WHERE Lang_id=?",
            (str(language[0]), )).fetchone()
        famname = lotw_base.execute(
            "SELECT * FROM Family where Id=?", (famdata[2], )).fetchone()[1]
        branchname = lotw_base.execute(
            "SELECT * FROM Branch where Id=?", (famdata[3], )).fetchone()[1]
        if not geodata:
            geodata = [0.0 for x in range(7)]
        data.add(
            lotw_devLanguage, language[0],
            id=str(language[0]),
            iso=language[3],
            family=famname,
            branch=branchname,
            name=language[1],
            latitude=geodata[5],
            longitude=geodata[6])
        print("Added language %s" % language[3])
        # Lang_id=language["Lang_id"], Order_of_addition=language["Order_of_addition"],
        # Sorting_number=language["Sorting_number"], Code_ISO_639_3=language["Code_ISO_639_3"]

        language_features = lotw_base.execute(
            "SELECT * FROM Binary_data WHERE Lang_id=? AND Feature_value=1",
            (str(language[0]), ))
        for l_feat in language_features.fetchall():
            feat_id = l_feat[0]
            try:
                feat_name = feature_dict[l_feat[2]].name
            except KeyError:
                continue
            vs = common.ValueSet(
                id=feat_id,
                language=data["lotw_devLanguage"][language[0]],
                parameter=feature_dict[l_feat[2]],
                contribution=contrib)
            DBSession.add(common.Value(id=feat_id, name=feat_name, valueset=vs))
            print("Added value %s" % feat_id)

    lotw_conn.close()
def main(args): data = Data(created=utc.localize(datetime(2013, 11, 15)), updated=utc.localize(datetime(2013, 12, 12))) icons = issues.Icons() #print icons DBSession.execute("delete from Language") DBSession.execute("delete from Unit") DBSession.execute("delete from featuredomain") DBSession.execute("delete from family") DBSession.execute("delete from source") DBSession.execute("delete from parameter") DBSession.execute("delete from feature") DBSession.execute("delete from domainelement") DBSession.execute("delete from valueset") DBSession.execute("delete from value") DBSession.execute("delete from lsivalue") DBSession.execute("delete from dataset") DBSession.execute("delete from contributor") DBSession.execute("delete from lsilanguage") DBSession.execute("delete from contribution") DBSession.execute("delete from designer") DBSession.flush() dtab = partial(_dtab, args.data_file()) #Languages #print args.data_file() #tabfns = ['%s' % fn.basename() for fn in args.data_file().files('nts_*.tab')] #tabfns = ['nts_18.tab'] ##tabfns = os.listdir('/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/data')[1:] tabfns = os.listdir( '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data' )[1:] #print tabfns args.log.info("Sheets found: %s" % tabfns) ldps = [] lgs = {} nfeatures = Counter() nlgs = Counter() for fn in tabfns: for ld in dtab(fn): if ld['language_id'] == 'qgr' or ld['language_id'] == '---' or ld[ 'language_id'] == '': # to exclude languages which do not have an iso-code continue if "feature_alphanumid" not in ld: args.log.info("NO FEATUREID %s %s" % (len(ld), ld)) if not ld["feature_alphanumid"].startswith("DRS") \ and ld["feature_alphanumid"].find(".") == -1: ldps.append(dp_dict(ld)) ##print ld lgs[ld['language_id']] = unescape(ld['language_name']) if ld["value"] != "?": nfeatures.update([ld['language_id']]) nlgs.update([ld['feature_alphanumid']]) ldps = sorted(ldps, key=lambda d: d['feature_alphanumid']) #lgs["ygr"] = "Hua" for lgid, lgname in lgs.items(): data.add(models.lsiLanguage, lgid, id=lgid, name=lgname, representation=nfeatures.get(lgid, 0)) DBSession.flush() #print "I am here" #print data['ntsLanguage'].values()[1].id load_families( data, ##[(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values()], [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values() if l.id != '---' and l.id != ''], isolates_icon='tcccccc') #print 'family' #print data['Family'].get('sino1245').jsondata #Domains for domain in set(ld['feature_domain'] for ld in ldps): data.add(models.FeatureDomain, domain, name=domain) DBSession.flush() #Designers #for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")): for i, info in enumerate([{ 'designer': 'shafqat', 'domain': '', 'pdflink': '', 'citation': '' }, { 'designer': '-', 'domain': '', 'pdflink': '', 'citation': '' }]): designer_id = str(i + 1) data.add(models.Designer, info['designer'], id=designer_id, name=designer_id, domain=info["domain"], contributor=info['designer'], pdflink=info["pdflink"], citation=info["citation"]) DBSession.flush() #Sources '''for k, (typ, bibdata) in [ ktfbib(bibsource) for ld in ldps if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,") ]: if k not in data["Source"]: data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata))) DBSession.flush()''' #Features fs = [(fid, mergeds(lds)) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdesc = [(fid, [ (ld.get("feature_possible_values"), 
ld.get("fromfile")) for ld in lds if ld.get("feature_possible_values") ]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc] fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1] for _, dfsids in groupby(sorted( (f.get('feature_name', fid), fid) for fid, f in fs), key=lambda t: t[0]): ##print [(k,v) for (k,v) in list(dfsids)],len(list(dfsids)) assert len(list(dfsids)) == 1 #print 'here is nlgs' for fid, f in fs: #print "lang name" #print ldps #print f.get('feature_possible_values', ""), if not fid.isdigit(): args.log.info("NO INT FID %s" % f) feature = data.add( models.Feature, fid, id=fid, name=f.get('feature_name', f['feature_alphanumid']), doc=f.get('feature_information', ""), vdoc=f.get('feature_possible_values', ""), representation=nlgs.get(fid, 0), designer=data["Designer"][f['designer']], dependson=f.get("depends_on", ""), abbreviation=f.get("abbreviation", ""), featuredomain=data['FeatureDomain'][f["feature_domain"]], name_french=f.get('francais', ""), clarification=f.get( "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""), alternative_id=f.get("old feature number", ""), jl_relevant_unit=f.get("relevant unit(s)", ""), jl_function=f.get("function", ""), jl_formal_means=f.get("formal means", ""), sortkey_str="", sortkey_int=int(fid)) vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")] vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist} ##vdesc = {fmly+val2icon(v): desc for ((v,desc),fmly) in itertools.product([(vv,desc) for [vv, desc] in vdesclist],['c','d','f','t'])} vdesc.setdefault('?', 'Not known') if 'N/A' not in vdesc and feature.dependson: vdesc["N/A"] = "Not Applicable" vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))} ##vicons = {f+val2icon(v):f+val2icon(v) for (v,f) in itertools.product(['0','1','2','3'],['c','d','f','t'])} ##vicons['?'] = 'c00ffff' ##vicons['N/A'] = 'c00ffff' ##vicons = icons.iconize(vi.keys()) for (v, desc) in vdesc.items(): #print v,vicons[v] data.add(common.DomainElement, (fid, v), id='%s-%s' % (fid, v), name=v, description=desc, jsondata={"icon": Colors[v]}, number=vi[v], parameter=feature) DBSession.flush() for ((f, lg), ixs) in grp2([((ld['feature_alphanumid'], ld['language_id']), i) for i, ld in enumerate(ldps)]): ixvs = set([ldps[ix]['value'] for ix in ixs]) if len(ixvs) == 1: continue args.log.warn("Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs])) ##print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs]) errors = {} done = set() glottolog = Glottolog() for ld in ldps: ############################### for printing different map markers for different familys for features:shafqat #print data['Family'] language = data['lsiLanguage'][ld['language_id']] if isinstance(language, (tuple, list)) and len(language) == 2: code, language = language else: code = language.id if code != '-': gl_language = glottolog.languoid(code) if gl_language: gl_family = gl_language.family if gl_family: family = data['Family'].get(gl_family.id) ##ld['value'] = ld['value']+'-'+str(family) ##ld['value'] = combineValueFamily(ld['value'],str(family)) #print family ##################################### parameter = data['Feature'][ld['feature_alphanumid']] language = data['lsiLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if id_ in done: continue if (ld['feature_alphanumid'], ld['value']) not in 
data['DomainElement']: if not ld["value"].strip(): continue info = (ld['feature_alphanumid'], ld.get('feature_name', "[Feature Name Lacking]"), ld['language_id'], ld['value'], ld['fromfile']) msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info args.log.error( msg.format( sorted([ y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid'] ]))) ##print msg.format(sorted( ## [y for (x, y) in data['DomainElement'].keys() ## if x == ld['feature_alphanumid']])) errors[(ld['feature_alphanumid'], ld['language_id'])] = info continue vs = common.ValueSet( id=id_, language=language, parameter=parameter, source=ld["source"] or None, ##contribution=parameter.designer ) #print #print "this one" #print ld['value'],family models.lsiValue( id=id_, domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])], jsondata={ "icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata, "family": FamilyCodes[str(family)] }, comment=ld["comment"], valueset=vs, contributed_datapoint=ld["contributor"]) done.add(id_) '''if not ld.get('bibsources'): if 'bibsources' not in ld: args.log.warn("no bibsource %s" % ld) continue for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]: common.ValueSetReference(valueset=vs, source=data['Source'][k])''' DBSession.flush() #To CLDF cldf = {} for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['lsiLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if not id_ in done: continue dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"]) cldf[dt] = None tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows]) savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "lsi.cldf") args.log.info('%s Errors' % len(errors)) dataset = common.Dataset( id="LSI", name='Linguistic Survey of India', publisher_name="Sprakbanken", publisher_place="Gothenburg", publisher_url="to be given", description="this is to be followed", domain='http://lsi.clld.org', published=date(2016, 05, 16), contact='*****@*****.**', license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en', jsondata={ 'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png', 'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany' }) # disabled for experimental purposes, names were appearing multiple times for i, contributor in enumerate([ common.Contributor(id="Lars Borin", name="Lars Borin", email="*****@*****.**"), common.Contributor(id="Shafqat Mumtaz Virk", name="Shafqat Mumtaz Virk", email="*****@*****.**"), common.Contributor(id="Anju Saxena", name="Anju Saxena", email="*****@*****.**"), common.Contributor(id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**") ]): #print i common.Editor(dataset=dataset, contributor=contributor, ord=i) '''cont1 = common.Contributor( id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**") cont2= common.Contributor( id="Shafqat Mumtaz Virk", name="Shafqat Mumtaz Virk", email="*****@*****.**") cont3 = common.Contributor( id="Lars Borin", name="Lars Borin", email="*****@*****.**") for contributor in [cont1,cont2,cont3]: common.Editor(dataset=dataset, contributor=contributor,ord=1)''' DBSession.add(dataset) DBSession.flush()
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")
    version = assert_release(glottolog.repos)

    dataset = common.Dataset(
        id='glottolog',
        name="{0} {1}".format(glottolog.publication.web.name, version),
        publisher_name=glottolog.publication.publisher.name,
        publisher_place=glottolog.publication.publisher.place,
        publisher_url=glottolog.publication.publisher.url,
        license=glottolog.publication.license.url,
        domain=purl.URL(glottolog.publication.web.url).domain(),
        contact=glottolog.publication.web.contact,
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': glottolog.publication.license.name},
    )
    data = Data()

    for e in glottolog.editors.values():
        if e.current:
            ed = data.add(common.Contributor, e.id, id=e.id, name=e.name)
            common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord))
    DBSession.add(dataset)

    contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog')
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['hammarstroem']))

    #
    # Add Parameters:
    #
    add = functools.partial(add_parameter, data)
    add('fc', name='Family classification')
    add('sc', name='Subclassification')
    add('aes',
        args.repos.aes_status.values(),
        name=args.repos.aes_status.__defaults__['name'],
        pkw=dict(
            jsondata=dict(
                reference_id=args.repos.aes_status.__defaults__['reference_id'],
                sources=[attr.asdict(v) for v in args.repos.aes_sources.values()],
                scale=[attr.asdict(v) for v in args.repos.aes_status.values()])),
        dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)),
    )
    add('med',
        args.repos.med_types.values(),
        name='Most Extensive Description',
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            number=de.rank,
            jsondata=dict(icon=de.icon)),
    )
    add('macroarea',
        args.repos.macroareas.values(),
        pkw=dict(
            description=args.repos.macroareas.__defaults__['description'],
            jsondata=dict(reference_id=args.repos.macroareas.__defaults__['reference_id'])),
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            jsondata=dict(geojson=read_macroarea_geojson(args.repos, de.name, de.description)),
        ),
    )
    add('ltype',
        args.repos.language_types.values(),
        name='Language Type',
        dekw=lambda de: dict(name=de.category, description=de.description),
        delookup='category',
    )
    add('country',
        args.repos.countries,
        dekw=lambda de: dict(name=de.id, description=de.name),
    )

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    #
    # Now load languoid data, keeping track of relations that can only be inserted later.
    #
    lgsources = defaultdict(list)
    # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that
    # top-level nodes will precede nested nodes. This order must be preserved using an
    # `OrderedDict`:
    nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()])
    lgcodes = {k: v.id for k, v in args.repos.languoids_by_code(nodemap).items()}
    for lang in nodemap.values():
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(glottolog, data, lang, nodemap)

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id,
            id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider, bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            mas = []
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma)
                mas.append(ma.name)
            ref.macroareas = ', '.join(mas)
def main(args):
    data = Data()
    ds = Pofatu(pathlib.Path(pofatu.__file__).parent.parent.parent / 'pofatu-data')

    dataset = common.Dataset(
        id=pofatu.__name__,
        name="POFATU",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='pofatu.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('hermannaymeric', 'Aymeric Hermann'),
        ('forkelrobert', 'Robert Forkel'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for rec in ds.iterbib():
        rec.genre = bibtex.EntryType.from_string(ENTRY_TYPES.get(rec.genre, rec.genre))
        if 'date' in rec:
            rec['year'] = rec.pop('date')
        data.add(common.Source, rec.id, _obj=bibtex2source(rec, lowercase_id=False))

    analyses = list(ds.iterdata())

    def midpoint(coords):
        p = MultiPoint(
            [(lat, lon + 360 if lon < 0 else lon) for lat, lon in coords]).convex_hull
        #geojson = {
        #    'type': 'Feature',
        #    'properties': {},
        #    'geometry': mapping(p)}
        c = p.centroid
        return c.x, (c.y - 360) if c.y > 180 else c.y

    artefacts = collections.defaultdict(dict)
    midpoints = {}
    for a in analyses:
        l = a.sample.location
        lid = l.id
        if lid not in midpoints:
            midpoints[lid] = set()
        if l.latitude is not None and l.longitude is not None:
            midpoints[lid].add((l.latitude, l.longitude))
        art = a.sample.artefact
        for attr_ in ['name', 'category', 'collection_type']:
            if not artefacts[slug(art.id)].get(attr_):
                artefacts[slug(art.id)][attr_] = getattr(art, attr_)

    midpoints = {k: midpoint(v) if v else (None, None) for k, v in midpoints.items()}

    for analysis in analyses:
        loc = analysis.sample.location
        if loc.id not in data['Location']:
            data.add(
                models.Location, loc.id,
                id=valid_id(loc.id),
                name=loc.label,
                latitude=midpoints[loc.id][0],
                longitude=midpoints[loc.id][1],
                region=loc.region.replace('_', ' '),
                subregion=loc.subregion,
                location=loc.locality,
            )

    # Add contributions
    for contrib in ds.itercontributions():
        contribution = data.add(
            common.Contribution, contrib.id,
            id=valid_id(contrib.id),
            name=contrib.label,
            description=contrib.description,
        )
        DBSession.flush()
        for i, name in enumerate(contrib.contributors):
            cid = slug(name)
            co = data['Contributor'].get(cid)
            if not co:
                co = data.add(common.Contributor, cid, id=cid, name=name)
            common.ContributionContributor(ord=i, contribution=contribution, contributor=co)

        for ref in contrib.source_ids:
            DBSession.add(
                common.ContributionReference(
                    contribution=contribution,
                    source=data['Source'][ref],
                ))
            data['Contribution'][ref] = contribution

    methods = collections.defaultdict(list)
    for method in ds.itermethods():
        m = data.add(
            models.Method, method.id,
            id=valid_id(method.id),
            name=method.label,
            code=method.code,
            parameter=method.parameter.strip(),
            instrument=method.instrument,
            number_of_replicates=method.number_of_replicates,
            date=method.date,
            comment=method.comment,
            detection_limit=method.detection_limit,
            detection_limit_unit=method.detection_limit_unit,
            total_procedural_blank_value=method.total_procedural_blank_value,
            total_procedural_unit=method.total_procedural_unit,
        )
        methods[(m.code.lower(), m.parameter.lower())].append(m)
        for ref in method.references:
            DBSession.add(
                models.MethodReference(
                    method=m,
                    sample_name=ref.sample_name,
                    sample_measured_value=ref.sample_measured_value,
                    uncertainty=ref.uncertainty,
                    uncertainty_unit=ref.uncertainty_unit,
                    number_of_measurements=ref.number_of_measurements,
                ))
        for ref in method.normalizations:
            DBSession.add(
                models.Normalization(
                    method=m,
                    reference_sample_name=ref.reference_sample_name,
                    reference_sample_accepted_value=ref.reference_sample_accepted_value,
                    citation=ref.citation,
                ))

    parameter = data.add(common.Parameter, 'c', id='category', name='Sample category')
    for i, opt in enumerate(
            attr.fields_dict(pypofatu.models.Sample)['sample_category'].validator.options,
            start=1):
        data.add(common.DomainElement, opt, parameter=parameter, id=str(i), name=opt)

    DBSession.flush()
    assert parameter.pk

    # Add Samples and UnitParameters and Measurements
    for analysis in analyses:
        sample = analysis.sample
        vsid = '{0}-{1}'.format(sample.location.id, data['Contribution'][sample.source_id].id)
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id=valid_id(vsid),
                language_pk=data['Location'][sample.location.id].pk,
                parameter_pk=parameter.pk,
                contribution_pk=data['Contribution'][sample.source_id].pk,
            )
        v = data['Sample'].get(sample.id)
        if not v:
            v = data.add(
                models.Sample, sample.id,
                id=valid_id(sample.id),
                name=sample.id,
                sample_name=sample.sample_name,
                sample_comment=sample.sample_comment,
                petrography=sample.petrography,
                latitude=sample.location.latitude,
                longitude=sample.location.longitude,
                elevation=sample.location.elevation,
                location_comment=sample.location.comment,
                site_name=sample.site.name,
                site_code=sample.site.code,
                site_context=sample.site.context,
                site_comment=sample.site.comment,
                site_stratigraphic_position=sample.site.stratigraphic_position,
                site_stratigraphy_comment=sample.site.stratigraphy_comment,
                domainelement=data['DomainElement'][sample.sample_category],
                valueset=vs,
                artefact_id=sample.artefact.id,
                artefact_name=sample.artefact.name,
                artefact_category=sample.artefact.category,
                artefact_comment=sample.artefact.comment,
                artefact_attributes=sample.artefact.attributes,
                artefact_collector=sample.artefact.collector,
                artefact_collection_type=sample.artefact.collection_type,
                artefact_collection_location=sample.artefact.collection_location,
                artefact_collection_comment=sample.artefact.collection_comment,
                artefact_fieldwork_date=sample.artefact.fieldwork_date,
            )
            DBSession.add(
                models.SampleReference(
                    description='sample', sample=v, source=data['Source'][sample.source_id]))
            for ref in sample.artefact.source_ids:
                DBSession.add(
                    models.SampleReference(
                        description='artefact', sample=v, source=data['Source'][ref]))
            for ref in sample.site.source_ids:
                DBSession.add(
                    models.SampleReference(
                        description='site', sample=v, source=data['Source'][ref]))

        a = data.add(
            models.Analysis, analysis.id,
            id=better_slug(analysis.id),
            name=analysis.id,
            sample=v,
        )
        for i, measurement in enumerate(analysis.measurements):
            if i == 0:
                method = measurement.method
                if method:
                    a.analyzed_material_1 = method.analyzed_material_1,
                    a.analyzed_material_2 = method.analyzed_material_2,
                    a.sample_preparation = method.sample_preparation,
                    a.chemical_treatment = method.chemical_treatment,
                    a.technique = method.technique,
                    a.laboratory = method.laboratory,
                    a.analyst = method.analyst,
            pid = slug(measurement.parameter, lowercase=False)
            p = data['Param'].get(pid)
            if not p:
                p = data.add(models.Param, pid, id=pid, name=measurement.parameter)
            data.add(
                models.Measurement, None,
                id='{0}-{1}'.format(a.id, p.id),
                analysis=a,
                method=data['Method'].get(measurement.method.id) if measurement.method else None,
                value=measurement.value,
                less=measurement.less,
                precision=measurement.value_sd,
                sigma=measurement.sd_sigma,
                unitparameter=p,
            )
def setUp(self):
    TestWithDb.setUp(self)

    DBSession.add(common.Dataset(
        id='dataset', name='dataset', description='desc', domain='clld'))

    source = common.Source(id='source')
    contributors = {'contributor': 'A Name', 'b': 'b Name', 'c': 'c Name', 'd': 'd Name'}
    for id_, name in contributors.items():
        contributors[id_] = common.Contributor(id=id_, name=name)

    contribution = common.Contribution(id='contribution', name='Contribution')
    cr = common.ContributionReference(contribution=contribution, source=source)

    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['contributor'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['b'])
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['c'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['d'])

    DBSession.add(contribution)

    language = common.Language(id='language', name='Language 1', latitude=10.5, longitude=0.3)
    language.sources.append(source)

    identifier = common.Identifier(type='iso639-3', id='iso')
    li = common.LanguageIdentifier(language=language, identifier=identifier)

    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        _li = common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)

    valueset = common.ValueSet(
        id='valueset', language=language, parameter=param, contribution=contribution)
    value = common.Value(
        id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    paramnd = common.Parameter(id='no-domain', name='Parameter without domain')
    valueset = common.ValueSet(
        id='vs2', language=language, parameter=paramnd, contribution=contribution)
    vr = common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)

    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(common.UnitValue(
        id='unitvalue', name='UnitValue', unit=unit, unitparameter=up))

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(common.UnitValue(
        id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = common.Sentence(
        id='sentence',
        name='sentence name',
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=language)
    sr = common.SentenceReference(sentence=sentence, source=source)

    DBSession.add(common.Config(key='key', value='value'))
    DBSession.flush()
def main(args):  # pragma: no cover
    data = Data()

    clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data'
    clts_repos = CLTS(clts_repos)
    print(clts_repos.repos)
    version = 'v2.1.0'  # assert_release(clts_repos.repos)

    for rec in Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id='clts',
        name="CLTS {0}".format(version),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for i, name in enumerate([
        'Johann-Mattis List',
        'Cormac Anderson',
        'Tiago Tresoldi',
        'Robert Forkel',
    ]):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for line in args.cldf['data/features.tsv']:
        data.add(
            models.Feature, line['ID'],
            id=line['ID'],
            name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']),
            sound_type=line['TYPE'],
            feature=line['FEATURE'],
            value=line['VALUE'],
        )

    DBSession.add(models.SoundSegment(
        id='NA',
        name='<NA>',
        description='<NA>',
        type='marker',
        generated=True,
        unicode='',
        color='#bbbbbb',
    ))
    for line in args.cldf['data/sounds.tsv']:
        s = data.add(
            models.SoundSegment, line['ID'],
            id=line['ID'],
            name=line['GRAPHEME'],
            description=line['NAME'],
            type=line['TYPE'],
            generated=line['GENERATED'],
            unicode=' / '.join(line['UNICODE']),
            color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']),
        )
        if s.color == '0':
            s.color = '#bbbbbb'
        assert s.color in LEGEND
    DBSession.flush()

    seen = set()
    for line in args.cldf['data/sounds.tsv']:
        for fid in line['FEATURES']:
            spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk
            if (spk, fpk) not in seen:
                DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk))
                seen.add((spk, fpk))

    english = data.add(common.Language, 'eng', id='eng', name='English')

    for line in args.cldf['sources/index.tsv']:
        c = data.add(
            models.Transcription, line['NAME'],
            id=line['NAME'],
            name=line['NAME'],
            description=line['DESCRIPTION'].replace(':bib:', '/sources/'),
            datatype=getattr(models.Datatype, line['TYPE'])
        )
        for ref in line.get('REFS', []):
            common.ContributionReference(source=data['Source'][ref], contribution=c)

    sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl
    image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl

    for line in args.cldf['data/graphemes.tsv']:
        key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME']
        if key not in data['Grapheme']:
            sound_id = line['NAME'].replace(' ', '_')
            vs = data['ValueSet'].get((line['DATASET'], line['NAME']))
            if not vs:
                try:
                    vs = data.add(
                        common.ValueSet, (line['DATASET'], line['NAME']),
                        id=key,
                        description=line['NAME'],
                        language=english,
                        contribution=data['Transcription'][line['DATASET']],
                        parameter=data['SoundSegment'][sound_id]
                    )
                except:
                    print(line)
                    raise
            data.add(
                models.Grapheme, key,
                id=key,
                name=line['GRAPHEME'],
                description=line['NAME'],
                url=line['URL'].unsplit() if line['URL'] else None,
                audio=sound_url_template.expand(line) if line['SOUND'] else None,
                image=image_url_template.expand(line) if line['IMAGE'] else None,
                valueset=vs
            )
def main(args): # pragma: no cover data = Data() print("Setting up dataset…") dataset = common.Dataset( id=cariban.__name__, domain="cariban.clld.org", name="Comparative Cariban Database", description="Comparative Cariban Database", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_url="https://www.eva.mpg.de", publisher_place="Leipzig", license="https://creativecommons.org/licenses/by/4.0/", contact="*****@*****.**", jsondata={'function_paradigms': []}, ) fps = [] for morph_func in args.cldf["ValueTable"]: for function in morph_func["Function"]: for cons in morph_func["Construction"]: fps.append({ 'Function': function, 'Construction': cons, 'Morpheme': morph_func['Morpheme']}) dataset.update_jsondata(function_paradigms=fps) DBSession.add(dataset) DBSession.flush() print("Adding contributors…") c = common.Contributor(id="fm",name="Florian Matter", email="*****@*****.**", url="https://florianmatter.gitlab.io/") dataset.editors.append(common.Editor(contributor=c, ord=1, primary=True)) print("Adding languages…") dialect_mapping = {} lang_shorthands = {} glottocodes = {} lang_ids = {} for lang in args.cldf["LanguageTable"]: if lang["Sampled"] == "y": language = data.add( common.Language, lang["ID"], id=lang["ID"], name=lang["Name"], latitude=float(lang["Latitude"]) if lang["Latitude"] is not None else None, longitude=float(lang["Longitude"]) if lang["Longitude"] is not None else None, jsondata={'Shorthand': lang['Shorthand'], 'Glottocode': lang['Glottocode']}, ) add_language_codes(data, language, isocode=lang["ISO"], glottocode = lang["Glottocode"]) if lang["Dialect_Of"] not in [None, "y"]: dialect_mapping[lang["ID"]] = lang["Dialect_Of"] lang_shorthands[lang["Shorthand"]] = {"ID": lang["ID"], "Name": lang["Name"]} glottocodes[lang["Glottocode"]] = {"ID": lang["ID"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]} lang_ids[lang["ID"]] = {"Glottocode": lang["Glottocode"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]} def get_lang_id(key): if key in lang_shorthands: lang_id = lang_shorthands[key]["ID"] elif key in glottocodes: lang_id = glottocodes[key]["ID"] elif key in lang_ids: lang_id = key else: print("Could not identify language %s" % key) return None if lang_id in dialect_mapping: lang_id = dialect_mapping[lang_id] return lang_id def get_key_and_page(source_string): if len(source_string.split("[")) > 1: bib_key = source_string.split("[")[0] pages = source_string.split("[")[1].split("]")[0] else: bib_key = source_string pages = "" return bib_key, pages print("Adding sources…") for rec in bibtex.Database.from_file(args.cldf.bibpath): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) print("Adding language sources…") DBSession.flush() for rec in bibtex.Database.from_file(args.cldf.bibpath): if "keywords" in rec: for keyword in rec["keywords"].split(","): if keyword in lang_shorthands: lang_id = get_lang_id(keyword.strip(" ")) if lang_id in data["Language"]: data.add(common.LanguageSource, rec.id+lang_id, language_pk=data["Language"][lang_id].pk, source_pk=data["Source"][rec.id].pk ) data.add( common.Source, "pc", id="pc", name="Personal communication", description="Placeholder for data obtained from personal communication.", bibtex_type=bibtex.EntryType.misc ) # print("Adding glossing abbreviations…") # length = len(pynterlinear.get_all_abbrevs().keys()) # for i, (key, name) in enumerate(pynterlinear.get_all_abbrevs().items()): # print("%s/%s" % (i+1, length), end="\r") # DBSession.add(common.GlossAbbreviation(id=key, name=name)) # print("") 
# print("Adding examples…") gloss_replacements = { "S_A_": "Sa", "S_P_": "Sp" } def clldify_glosses(gloss_line): for orig, new in gloss_replacements.items(): gloss_line = gloss_line.replace(orig,new) gloss_line = re.sub(r"(\d)([A-Z])", r"\1.\2", gloss_line) return gloss_line for ex in args.cldf["ExampleTable"]: lang_id = get_lang_id(ex["Language_ID"]) new_ex = data.add(common.Sentence, ex["ID"], id=ex["ID"], name=ex["Name"], description=ex["Translated_Text"], analyzed="\t".join(ex["Analyzed_Word"]), gloss=clldify_glosses("\t".join(ex["Gloss"])), language=data["Language"][lang_id], comment=ex["Comment"], markup_gloss="\t".join(ex["Morpheme_IDs"]) ) if ex["Source"]: bib_key, pages = get_key_and_page(ex["Source"]) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(common.SentenceReference( sentence=new_ex, source=source, key=source.id, description=pages.replace("--","–")) ) def add_morpheme_reference(morpheme, source_string): bib_key, pages = get_key_and_page(source_string) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.MorphemeReference( morpheme=morpheme, source=source, key=source.id, description=pages.replace("--","–") ) ) print("Adding morphemes…") for morph in args.cldf["FormTable"]: lang_id = get_lang_id(morph["Language_ID"]) form = util.merge_allomorphs("; ".join(morph["Form"])).split("; ") new_morph = data.add(models.Morpheme, morph["ID"], morpheme_type="grammatical", language=data["Language"][lang_id], name="/".join(form), id=morph["ID"], ) if morph["Source"]: add_morpheme_reference(new_morph, morph["Source"][0]) print("Adding constructions…") data.add(models.DeclarativeType, "imp", id="imp", name="imperative") data.add(models.DeclarativeType, "decl", id="decl", name="declarative") data.add(models.MainClauseVerb, "y", id="y", name="main clause construction") data.add(models.MainClauseVerb, "n", id="n", name="subordinate clause construction") for cons in args.cldf["ParameterTable"]: lang_id = get_lang_id(cons["Language_ID"]) new_construction = data.add( models.Construction, cons["ID"], id=cons["ID"], language=data["Language"][lang_id], name=cons["Description"], mainclauseverb=data["MainClauseVerb"][cons["MainClauseVerb"]], ) if cons["DeclarativeType"]: new_construction.declarativetype = data["DeclarativeType"][cons["DeclarativeType"]] def add_morph_func(morpheme, func_key, construction): data.add(models.MorphemeFunction, "%s:%s" % (morpheme, function), id="%s:%s" % (morpheme, func_key), name="MorphemeFunction %s:%s"% (morpheme, func_key), unit=data["Morpheme"][morpheme], unitparameter=data["Meaning"][function], construction=construction ) print("Adding morpheme functions…") for morph_func in args.cldf["ValueTable"]: for function in morph_func["Function"]: func_key = function.replace(".","_") if ">" in function or function == "LK" or bool(re.search(r"\d[SP]$", function) or function == "3"): meaning_type="inflectional" else: meaning_type="derivational" if function not in data["Meaning"]: data.add(models.Meaning, function, id=func_key, name=function, meaning_type=meaning_type ) #Only some morpheme functions are specified as occurring in specific constructions if len(morph_func["Construction"]) == 0: for morpheme in morph_func["Morpheme"]: add_morph_func(morpheme, func_key, None) else: for construction in morph_func["Construction"]: if len(morph_func["Morpheme"]) == 1 and morph_func["Morpheme"][0] != "?": for morpheme in morph_func["Morpheme"]: if data["Morpheme"][morpheme].language != 
data["Construction"][construction].language: print("Warning: the %s Morpheme %s is stated to occur in the %s construction %s!" % ( data["Morpheme"][morpheme].language, data["Morpheme"][morpheme], data["Construction"][construction].language, data["Construction"][construction] ) ) cons_func_key = func_key + ":" + construction add_morph_func(morpheme, cons_func_key, data["Construction"][construction]) print("Checking examples for illustrated morphemes…") proto_languages = ["pc"] is_illustrated = {} for key, row in data["MorphemeFunction"].items(): if row.unit.language.id in proto_languages: continue is_illustrated["%s:%s" % (row.unit.id, row.unitparameter.id)] = False for row in args.cldf["ExampleTable"]: for word in row["Morpheme_IDs"]: morph_ids = util.split_word(word) for unit_value in morph_ids: if unit_value in ["X","-","=", "~"]: continue unitvaluesentence_key = "{0}-{1}".format(unit_value.replace(".","-"),row["ID"]) if unitvaluesentence_key in data["UnitValueSentence"].keys(): continue is_illustrated[unit_value] = True morph_id = unit_value.split(":")[0] if morph_id not in data["Morpheme"].keys(): print("Warning: Example %s illustrates unknown morpheme %s" % (row["ID"], morph_id)) elif data["Morpheme"][morph_id].language != data["Sentence"][row["ID"]].language: print("Warning: The %s example %s claims to contain the %s morpheme %s." % ( data["Sentence"][row["ID"]].language, row["ID"], data["Morpheme"][morph_id].language, data["Morpheme"][morph_id] ) ) if ":" not in unit_value: print("%s in %s contains no defined function!" % (unit_value, row["ID"])) function = unit_value.split(":")[1] morph_function_id = "%s:%s" % (morph_id, function) if morph_function_id not in data["MorphemeFunction"].keys(): print("Warning: Example %s tries to illustrate inexistent morpheme function %s!" % (row["ID"], unit_value.replace(".","-"))) continue data.add(models.UnitValueSentence, unitvaluesentence_key, sentence=data["Sentence"][row["ID"]], unitvalue=data["MorphemeFunction"][morph_function_id], ) # see how many morpheme functions are illustrated with example sentences good_ill = [key for key, value in is_illustrated.items() if value] not_ill = [key for key, value in is_illustrated.items() if not value] not_ill.sort() cov = len(good_ill)/len(is_illustrated)*100 print("Morpheme exemplification coverage is at %s%%. 
List of unillustrated morphemes saved to unillustrated_morphemes.txt" % str(round(cov, 2))) f = open("../unillustrated_morphemes.txt", "w") for morph in not_ill: f.write(morph+"\n") f.close() print("Adding cognate sets…") for cogset in args.cldf["CognatesetTable"]: new_cset = data.add(models.Cognateset, cogset["ID"], id=cogset["ID"], name=cogset["Name"], description=cogset["Function"], cogset_type="grammatical" ) if cogset["Source"]: for source in cogset["Source"]: bib_key, pages = get_key_and_page(source) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.CognatesetReference( cognateset=new_cset, source=source, key=source.id, description=pages) ) print("Adding cognates…") for morph in args.cldf["FormTable"]: for cognate_ID in morph["Cognateset_ID"]: DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognate_ID], counterpart=data["Morpheme"][morph["ID"]] ) ) print("Adding morpheme comments…") for row in args.cldf["FormTable"]: data["Morpheme"][row["ID"]].markup_description=util.generate_markup(row["Comment"]) print("Adding construction descriptions…") for cons in args.cldf["ParameterTable"]: if cons["Comment"] is None: description = "" else: description = util.generate_markup(cons["Comment"]) description += "\n" + util.generate_markup(util.transitive_construction_paradigm(cons["ID"])) description += util.generate_markup(util.intransitive_construction_paradigm(cons["ID"])) data["Construction"][cons["ID"]].markup_description = description print("Adding cognate set descriptions…") for cogset in args.cldf["CognatesetTable"]: data["Cognateset"][cogset["ID"]].markup_description = util.generate_markup(cogset["Description"]) # if cogset["ID"] == "13pro": # data["Cognateset"][cogset["ID"]].markup_description += util.generate_markup( # util.comparative_function_paradigm( # ["apa_main", "tri_main", "way_main", "mak_main", "kar_main", "hix_main", "wai_main", "ara_main", "ikp_main", "wmr_main", "pan_old", "kax_main"], # "1+3 scenarios", # ["1+3S", "1+3>3", "3>1+3", "2>1+3", "1+3>2"])) def add_tree_labels(phylo): uncertain_nodes = [] for node in phylo.find_clades(): if node.name == None or not node.is_terminal(): continue plain_name = node.name.replace("?","") if "?" in node.name: uncertain_nodes.append(plain_name) if plain_name in lang_ids: node.name = lang_ids[plain_name]["Name"].replace("'", "’") if plain_name in uncertain_nodes: node.name += "?" return phylo, uncertain_nodes print("Adding trees…") own_trees = ["matter"] tree_path = str(args.cldf.tablegroup._fname.parent / '..' / 'raw') newick_files = {} for tree in args.cldf["cariban_trees.csv"]: if tree["ID"] in own_trees: continue newick_files[tree["ID"]] = { "orig": tree["ID"]+"_orig.newick", "norm": tree["ID"]+"_norm.newick", "source": tree["Source"], "comment": tree["Comment"], "o_comment": tree["Orig_Comment"] } #adding my own trees separately. for my_tree_count, tree_id in enumerate(own_trees): my_tree = Phylo.read(tree_path+"/"+"%s.newick" % tree_id, "newick") my_tree, uncertain_nodes = add_tree_labels(my_tree) edited_tree = io.StringIO() Phylo.write(my_tree, edited_tree, "newick") tree = edited_tree.getvalue().replace(":0.00000","") my_phylo = Phylogeny( tree_id, id=tree_id, name="Matter (2020)",# % str(my_tree_count+1), newick=tree, markup_description="My own, conservative, classification." ) for l in DBSession.query(common.Language): lname = l.name.replace("'", "’") if l.id in uncertain_nodes: lname += "?" 
new_label = LanguageTreeLabel( language=l, treelabel=TreeLabel( id="%s_%s" % (tree_id, l.id), name=lname, phylogeny=my_phylo ) ) DBSession.add(my_phylo) #adding the other trees for tree_id, values in newick_files.items(): norm_biotree = Phylo.read(tree_path+"/"+values["norm"], "newick") orig_biotree = Phylo.read(tree_path+"/"+values["orig"], "newick") norm_biotree, uncertain_nodes = add_tree_labels(norm_biotree) edited_tree = io.StringIO() Phylo.write(norm_biotree, edited_tree, "newick") norm_tree = edited_tree.getvalue().replace(":0.00000","") edited_tree = io.StringIO() Phylo.write(orig_biotree, edited_tree, "newick") orig_tree = edited_tree.getvalue().replace(":0.00000","") norm_phylo = Phylogeny( id=tree_id+"_norm", name=str(data["Source"][values["source"]]) + " (Normalized)", markup_description=util.generate_markup("Source: src:"+values["source"])+ "<br>This is a normalized version of <a href='/phylogeny/%s_orig'>this original tree</a>." % tree_id + util.generate_markup( "<br>Comments: %s" % values["comment"] ), newick=norm_tree ) if values["o_comment"] == None: o_comment = "" else: o_comment = values["o_comment"] orig_phylo = Phylogeny( id=tree_id+"_orig", name=str(data["Source"][values["source"]]) + " (Original)", markup_description=util.generate_markup("Source: src:"+values["source"])+ "<br>This is a representation of the original classification. A normalized version can be found <a href='/phylogeny/%s_norm'>here</a>." % tree_id + util.generate_markup( "<br>Comments: %s" % values["comment"] + " " + o_comment ), newick=orig_tree ) for l in DBSession.query(common.Language): lname = l.name.replace("'", "’") if l.id in uncertain_nodes: lname += "?" new_label = LanguageTreeLabel( language=l, treelabel=TreeLabel( id="%s_%s" % (tree_id, l.id), name=lname, phylogeny=norm_phylo ) ) DBSession.add(norm_phylo) DBSession.add(orig_phylo) print("Adding t-adding verb cognate sets…") for t_verb_set in args.cldf["cariban_t_cognates.csv"]: cognate_ID = "t"+t_verb_set["ID"] rec_t_form = "*[%s]%s" % (t_prefix_form(t_verb_set["Form"]), t_verb_set["Form"]) t_cogset = data.add(models.Cognateset, cognate_ID, id=cognate_ID, name=rec_t_form, description="‘%s’ (*t-adding verb)" % t_verb_set["Parameter_ID"], cogset_type="t_adding" ) if t_verb_set["Source"]: bib_key = t_verb_set["Source"].split("[")[0] if len(t_verb_set["Source"].split("[")) > 1: pages = t_verb_set["Source"].split("[")[1].split("]")[0] else: pages = " " if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.CognatesetReference( cognateset=t_cogset, source=source, key=source.id, description=pages) ) print("Adding t-adding verbs…") t_langs = {} t_verbs = {} non_t_adding_lgs = ["ing","mac","kar","wmr","pan"] data.add(models.Meaning, "t_verb", id="t-verb", name="t-adding verb", ) for t_verb_entry in args.cldf["cariban_t_verbs.csv"]: if t_verb_entry["Language_ID"] == "cari1283": continue cognate_ID = "t"+t_verb_entry["Cognateset_ID"] lang_id = get_lang_id(t_verb_entry["Language_ID"]) morph_id = lang_id+"_"+cognate_ID if morph_id in data["Morpheme"].keys(): if morph_id + "_2" in data["Morpheme"].keys(): morph_id += "_3" else: morph_id += "_2" t_verb = data.add(models.Morpheme, morph_id, id=morph_id, morpheme_type="t_adding", name=t_verb_entry["Form"], language=data["Language"][lang_id], ) DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognate_ID], counterpart=t_verb ) ) if t_verb_entry["t"] == "y": t_verb.name = "[%s]%s" % (t_prefix_form(t_verb.name), t_verb.name) t_verb.markup_description = 
util.generate_markup("Shows cogset:t") if t_verb_entry["t"] == "?" and lang_id not in non_t_adding_lgs: t_verb.name = "[t-?]"+t_verb.name t_verb.markup_description = util.generate_markup("It is not known if this verb shows cogset:t") if t_verb_entry["t"] == "n": t_verb.markup_description = util.generate_markup("Does not show cogset:t") if lang_id not in t_langs.keys(): t_langs[lang_id] = {"y": 0, "n": 0, "?": 0} if cognate_ID not in t_verbs.keys(): t_verbs[cognate_ID] = {"y": 0, "n": 0, "?": 0} t_langs[lang_id][t_verb_entry["t"]] += 1 if lang_id not in non_t_adding_lgs: t_verbs[cognate_ID][t_verb_entry["t"]] += 1 if t_verb_entry["Source"]: add_morpheme_reference(t_verb, t_verb_entry["Source"]) data.add(models.MorphemeFunction, "t_"+t_verb_entry["ID"], id="t_"+t_verb_entry["ID"], name="t-Verb %s" % t_verb_entry["Parameter_ID"], unit=t_verb, unitparameter=data["Meaning"]["t_verb"], construction=None ) for lang, values in t_langs.items(): data["Language"][lang].update_jsondata(t_values=values) for verb, values in t_verbs.items(): # data["Cognateset"][verb].description += " (%s/%s)" % (str(values["y"]), str(values["n"]+values["y"]+values["?"])) data["Cognateset"][verb].markup_description = util.generate_markup("This verb occurs with obj:t- in %s of %s languages which show reflexes of cogset:t." % (str(values["y"]), str(values["n"]+values["y"]+values["?"]))) print("Adding reconstructed lexemes…") proto_forms = {} for cogset in args.cldf["cariban_lexical_reconstructions.csv"]: proto_forms[cogset["ID"]] = cogset["Form"] first_found = [] for entry in args.cldf["cariban_swadesh_list.csv"]: cognateset_ID = entry["Parameter_ID"].replace("/","_")+"-"+entry["Cognateset_ID"] if cognateset_ID not in data["Cognateset"]: if cognateset_ID in proto_forms: form = "*" + proto_forms[cognateset_ID].replace("; ", " / ") # else: # form = "" data.add(models.Cognateset, cognateset_ID, id=cognateset_ID, name=form, description=cognateset_ID, cogset_type="lexical" ) lang_id = get_lang_id(entry["Language_ID"]) if lang_id not in data["Language"]: continue function = entry["Parameter_ID"].replace(".","_") morph_id = entry["Language_ID"] + "_" + function if morph_id in first_found: continue first_found.append(morph_id) if function not in data["Meaning"].keys(): data.add(models.Meaning, function, id=function, name=function, meaning_type="lexical" ) morpheme = data.add(models.Morpheme, morph_id, id=morph_id, morpheme_type="lexical", name=entry["Value"][0], language=data["Language"][lang_id], ) data.add(models.MorphemeFunction, "%s:%s" % (morph_id, function), id="%s:%s" % (morph_id, function), name="MorphemeFunction %s:%s"% (morph_id, function), unit=data["Morpheme"][morph_id], unitparameter=data["Meaning"][function], construction=None ) if entry["Source"]: add_morpheme_reference(morpheme, entry["Source"]) if cognateset_ID in proto_forms: DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognateset_ID], counterpart=morpheme ) )
def main(args): data = Data() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read(args, 'People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 11, 4), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License' }) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict( (row['ID'], row['RGB_code']) for row in read(args, 'Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True): for match in GLOSS_ABBR_PATTERN.finditer(row.standard): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning)) non_bibs = {} for row in read(args, 'References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add(common.Source, row['Reference_ID'], id=str(row['Reference_ID']), name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() DBSession.flush() infobox = 
jsonload(args.data_file('infobox.json')) glottocodes = jsonload(args.data_file('glottocodes.json')) for row in read(args, 'Languages', 'Order_number'): lon, lat = [ float(c.strip()) for c in row['map_coordinates'].split(',') ] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in enumerate(infobox[lect.id]): DBSession.add( common.Language_data(object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] \ != "Checked": print 'unchecked! ---', row['Language_name'] desc = row.get( 'Languages_contribution_documentation::Lect description', '') markup_desc = normalize_markup(row[ 'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'] ) c = data.add( models.ApicsContribution, row['Language_ID'], id=str(row['Order_number']), name=row['Language_name'], description=desc, markup_description=markup_desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) for ext, label, mtype in [ ('pdf', 'Glossed text', 'application/pdf'), ('mp3', 'Glossed text audio', 'audio/mpeg'), ]: fid = '%s-gt.%s' % (c.id, ext) if args.data_file('files', 'contribution', c.id, fid).exists(): common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype) else: print label, 'missing for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add(common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type=common.IdentifierType.iso.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if lect.id in glottocodes: identifier = data.add(common.Identifier, 'gc:%s' % glottocodes[lect.id], id=glottocodes[lect.id], name=glottocodes[lect.id], type=common.IdentifierType.glottolog.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=identifier)) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add(common.Identifier, row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][ row['Language_name_ethnologue']])) example_count = {} for row in read(args, 'Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max( [example_count.get(row['Language_ID'], 1), row['Example_number']]) p = add_sentence( args, data, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), 
markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']), original_script=row['Original_script'], jsondata={ 'sort': row['Order_number'], 'alt_translation': (row['Translation_other'] or '').strip() or None }, language=lang) if row['Reference_ID']: if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.SentenceReference( sentence=p, source=source, key=source.id, description=row['Reference_pages'])) else: p.source = non_bibs[row['Reference_ID']] DBSession.flush() for row in read(args, 'Language_references'): if row['Reference_ID'] not in data['Source']: assert row['Reference_ID'] in non_bibs continue assert row['Language_ID'] in data['ApicsContribution'] source = data['Source'][row['Reference_ID']] DBSession.add( common.ContributionReference( contribution=data['ApicsContribution'][row['Language_ID']], source=source, description=row['Pages'], key=source.id)) # # global counter for features - across feature types # feature_count = 0 for row in read(args, 'Features', 'Feature_number'): id_ = str(row['Feature_number']) if int(id_) > feature_count: feature_count = int(id_) wals_id = None desc = row['Feature_annotation_publication'] if row['WALS_match'] == 'Total': if isinstance(row['WALS_No.'], int): wals_id = row['WALS_No.'] else: wals_id = int(row['WALS_No.'].split('.')[0].strip()) p = data.add(models.Feature, row['Feature_code'], name=row['Feature_name'], id=id_, description=desc, markup_description=normalize_markup( row['z_calc_Feature_annotation_publication_CSS']), feature_type='primary', multivalued=row['Value_relation_type'] != 'Single', area=row['Feature_area'], wals_id=wals_id) names = {} for i in range(1, 10): if not row['Value%s_publication' % i] \ or not row['Value%s_publication' % i].strip(): continue name = row['Value%s_publication' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 de = data.add( common.DomainElement, '%s-%s' % (row['Feature_code'], i), id='%s-%s' % (id_, i), name=name, parameter=p, abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name, number=int(row['Value%s_value_number_for_publication' % i]), jsondata={'color': colors[row['Value_%s_colour_ID' % i]]}, ) assert de if row['Authors_FeatureArticles']: authors, _ = row['Authors_FeatureArticles'].split('and the APiCS') authors = authors.strip() if authors.endswith(','): authors = authors[:-1].strip() for i, name in enumerate(authors.split(',')): assert name.strip() in editors p._authors.append( models.FeatureAuthor(ord=i + 1, contributor=editors[name.strip()])) DBSession.flush() primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41} segment_to_primary = dict( zip(primary_to_segment.values(), primary_to_segment.keys())) number_map = {} names = {} for row in read(args, 'Segment_features', 'Order_number'): symbol = row['Segment_symbol'] if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate': symbol = 't\u0361s' truth = lambda s: s and s.strip().lower() == 'yes' name = '%s - %s' % (symbol, row['Segment_name']) if name in names: number_map[row['Segment_feature_number']] = names[name] continue number_map[ row['Segment_feature_number']] = row['Segment_feature_number'] names[name] = row['Segment_feature_number'] feature_count += 1 if row['Segment_feature_number'] in segment_to_primary: primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\ = str(feature_count) p = data.add(models.Feature, row['Segment_feature_number'], name=name, id=str(feature_count), feature_type='segment', area='Vowels' if 
truth(row['Vowel']) else ('Obstruent consonants' if truth(row['Obstruent']) else 'Sonorant consonants'), jsondata=dict( number=int(row['Segment_feature_number']), vowel=truth(row['Vowel']), consonant=truth(row['Consonant']), obstruent=truth(row['Obstruent']), core_list=truth(row['Core_list_segment']), symbol=symbol, )) for i, spec in SEGMENT_VALUES.items(): data.add(common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]), id='%s-%s' % (p.id, i), name=spec[0], parameter=p, jsondata={'color': spec[1]}, number=i) print '--> remapped:', primary_to_segment DBSession.flush() for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'): feature_count += 1 p = data.add(models.Feature, row['Sociolinguistic_feature_code'], name=row['Sociolinguistic_feature_name'], id='%s' % feature_count, description=row['Sociolinguistic_feature_annotation'], area='Sociolinguistic', feature_type='sociolinguistic') names = {} for i in range(1, 10): id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i) if row.get('Value%s' % i) and row['Value%s' % i].strip(): name = row['Value%s' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 else: continue kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i) data.add(common.DomainElement, id_, id='%s-%s' % (p.id, i), name=name, parameter=p, number=i, jsondata={ 'color': colors.get(row['Value%s_colour_ID' % i], colors.values()[i]) }) sd = {} for row in read(args, 'Segment_data'): if row['Segment_feature_number'] not in number_map: continue number = number_map[row['Segment_feature_number']] if not row['Presence_in_the_language']: continue lang = data['Lect'][row['Language_ID']] param = data['Feature'][number] id_ = '%s-%s' % (lang.id, param.id) if id_ in sd: assert row['c_Record_is_a_duplicate'] == 'Yes' continue sd[id_] = 1 valueset = data.add( common.ValueSet, id_, id=id_, parameter=param, language=lang, contribution=data['ApicsContribution'][row['Language_ID']], description=row['Comments'], markup_description=normalize_markup(row['z_calc_Comments_CSS']), ) v = data.add( common.Value, id_, id=id_, frequency=float(100), valueset=valueset, domainelement=data['DomainElement'][ '%s-%s' % (number, row['Presence_in_the_language'])], ) if row['Example_word'] and row['Example_word_gloss']: example_count[row['Language_ID']] += 1 p = add_sentence(args, data, '%s-p%s' % (lang.id, data['Feature'][number].id), id='%s-%s' % (lang.id, example_count[row['Language_ID']]), name=row['Example_word'], description=row['Example_word_gloss'], language=lang) DBSession.add(common.ValueSentence(value=v, sentence=p)) source = data['Source'].get(row['Refers_to_references_Reference_ID']) if source: DBSession.add( common.ValueSetReference(valueset=valueset, source=source, key=source.id)) elif row['Refers_to_references_Reference_ID'] in non_bibs: valueset.source = non_bibs[ row['Refers_to_references_Reference_ID']] lects = defaultdict(lambda: 1) lect_map = {} records = {} false_values = {} no_values = {} wals_value_number = {} for row in read(args, 'wals'): if row['z_calc_WALS_value_number']: wals_value_number[ row['Data_record_id']] = row['z_calc_WALS_value_number'] def prefix(attr, _prefix): if _prefix: return '%s_%s' % (_prefix, attr) return attr.capitalize() for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]: num_values = 10 for row in read(args, prefix('data', _prefix)): if not row[prefix('feature_code', _prefix)]: print('no associated feature for', prefix('data', _prefix), row[prefix('data_record_id', _prefix)]) continue lid 
= row['Language_ID'] lect_attr = row.get('Lect_attribute', 'my default lect').lower() if lect_attr != 'my default lect': if (row['Language_ID'], row['Lect_attribute']) in lect_map: lid = lect_map[(row['Language_ID'], row['Lect_attribute'])] else: lang = data['Lect'][row['Language_ID']] c = lects[row['Language_ID']] lid = '%s-%s' % (row['Language_ID'], c) kw = dict( name='%s (%s)' % (lang.name, row['Lect_attribute']), id='%s' % (1000 + 10 * int(lang.id) + c), latitude=lang.latitude, longitude=lang.longitude, description=row['Lect_attribute'], language=lang, ) data.add(models.Lect, lid, **kw) lects[row['Language_ID']] += 1 lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid id_ = abbr + str(row[prefix('data_record_id', _prefix)]) assert id_ not in records records[id_] = 1 assert row[prefix('feature_code', _prefix)] in data['Feature'] language = data['Lect'][lid] parameter = data['Feature'][row[prefix('feature_code', _prefix)]] valueset = common.ValueSet( id='%s-%s' % (language.id, parameter.id), description=row['Comments_on_value_assignment'], markup_description=normalize_markup( row.get('z_calc_Comments_on_value_assignment_CSS')), ) values_found = {} for i in range(1, num_values): if not row['Value%s_true_false' % i]: continue if row['Value%s_true_false' % i].strip().lower() != 'true': assert row['Value%s_true_false' % i].strip().lower() == 'false' false_values[row[prefix('data_record_id', _prefix)]] = 1 continue iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i) if iid not in data['DomainElement']: print(iid, row[prefix('data_record_id', _prefix)], '--> no domainelement!') continue values_found['%s-%s' % (id_, i)] = dict( id='%s-%s' % (valueset.id, i), domainelement=data['DomainElement']['%s-%s' % (row[prefix( 'feature_code', _prefix)], i)], confidence=row['Value%s_confidence' % i], frequency=float(row['c_V%s_frequency_normalised' % i]) if _prefix == '' else 100) if values_found: if row[prefix('data_record_id', _prefix)] in wals_value_number: valueset.jsondata = { 'wals_value_number': wals_value_number.pop(row[prefix( 'data_record_id', _prefix)]) } valueset.parameter = parameter valueset.language = language valueset.contribution = data['ApicsContribution'][ row['Language_ID']] valueset = data.add(common.ValueSet, id_, _obj=valueset) for i, item in enumerate(values_found.items()): if i > 0 and not parameter.multivalued: print 'multiple values for single-valued parameter: %s' % id_ break id_, kw = item kw['valueset'] = valueset value = data.add(common.Value, id_, **kw) # # store references to additional data for segments which should be reused # for corresponding primary features! 
# if int(parameter.id) in primary_to_segment: assert len(values_found) == 1 seg_id = '%s-%s' % (language.id, primary_to_segment[int( parameter.id)]) seg_valueset = data['ValueSet'][seg_id] seg_value = data['Value'][seg_id] if not valueset.description and seg_valueset.description: valueset.description = seg_valueset.description for s in seg_value.sentence_assocs: DBSession.add( common.ValueSentence(value=value, sentence=s.sentence)) for r in seg_valueset.references: DBSession.add( common.ValueSetReference(valueset=valueset, source=r.source, key=r.key)) if not valueset.source and seg_valueset.source: valueset.source = seg_valueset.source DBSession.flush() else: no_values[id_] = 1 DBSession.flush() for prefix, abbr, num_values in [ ('D', '', 10), ('Sociolinguistic_d', 'sl', 7), ]: for row in read(args, prefix + 'ata_references'): assert row['Reference_ID'] in data['Source'] \ or row['Reference_ID'] in non_bibs try: vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])] if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.ValueSetReference( valueset=vs, source=source, key=source.id, description=row['Pages'], )) else: if vs.source: vs.source += '; ' + non_bibs[row['Reference_ID']] else: vs.source = non_bibs[row['Reference_ID']] except KeyError: continue DBSession.flush() missing = 0 for row in read(args, 'Value_examples'): try: DBSession.add( common.ValueSentence( value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row], sentence=data['Sentence'][ '%(Language_ID)s-%(Example_number)s' % row], description=row['Notes'], )) except KeyError: missing += 1 print('%s Value_examples are missing data' % missing) print('%s data sets with false values' % len(false_values)) print('%s data sets without values' % len(no_values)) for k, v in wals_value_number.items(): print 'unclaimed wals value number:', k, v for i, row in enumerate(read(args, 'Contributors')): kw = dict(contribution=data['ApicsContribution'][row['Language ID']], contributor=data['Contributor'][row['Author ID']]) if row['Order_of_appearance']: kw['ord'] = int(float(row['Order_of_appearance'])) data.add(common.ContributionContributor, i, **kw) DBSession.flush()
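# Sketch of the recurring value-loading pattern above (all ids hypothetical, objects
# passed in are assumed to exist already): one ValueSet per (language, parameter,
# contribution), each Value pointing to a DomainElement of that parameter, and
# bibliographic support attached through ValueSetReference.
from clld.db.meta import DBSession
from clld.db.models import common


def sketch_add_categorical_value(data, language, parameter, contribution,
                                 domainelement, source=None, pages=None):
    vsid = '%s-%s' % (language.id, parameter.id)
    vs = data.add(
        common.ValueSet, vsid,
        id=vsid, language=language, parameter=parameter, contribution=contribution)
    data.add(common.Value, vsid, id=vsid, valueset=vs, domainelement=domainelement)
    if source is not None:
        DBSession.add(common.ValueSetReference(
            valueset=vs, source=source, key=source.id, description=pages))
    return vs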
def load(args): fts.index('fts_index', models.Ref.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") dataset = common.Dataset( id='glottolog', name="Glottolog {0}".format(args.args[0]), publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='glottolog.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) data = Data() for i, (id_, name) in enumerate([ ('hammarstroem', 'Harald Hammarström'), ('bank', 'Sebastian Bank'), ('forkel', 'Robert Forkel'), ('haspelmath', 'Martin Haspelmath'), ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) clf = data.add(common.Contribution, 'clf', id='clf', name='Classification') DBSession.add(common.ContributionContributor( contribution=clf, contributor=data['Contributor']['hammarstroem'])) for pid, pname in [ ('fc', 'Family classification'), ('sc', 'Subclassification'), ('vitality', 'Degree of endangerment'), ]: data.add(common.Parameter, pid, id=pid, name=pname) legacy = jsonlib.load(gc2version(args)) for gc, version in legacy.items(): data.add(models.LegacyCode, gc, id=gc, version=version) glottolog = args.repos for ma in Macroarea: data.add( models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description) for country in glottolog.countries: data.add(models.Country, country.id, id=country.id, name=country.name) lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list) languoids = list(glottolog.languoids()) nodemap = {l.id: l for l in languoids} for lang in languoids: for ref in lang.sources: lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id) load_languoid(data, lang, nodemap) mas[lang.id] = [ma.name for ma in lang.macroareas] countries[lang.id] = [c.id for c in lang.countries] lgcodes[lang.id] = lang.id if lang.hid: lgcodes[lang.hid] = lang.id if lang.iso: lgcodes[lang.iso] = lang.id for gc in glottolog.glottocodes: if gc not in data['Languoid'] and gc not in legacy: common.Config.add_replacement(gc, None, model=common.Language) for obj in jsonlib.load(glottolog.references_path('replacements.json')): common.Config.add_replacement( '{0}'.format(obj['id']), '{0}'.format(obj['replacement']) if obj['replacement'] else None, model=common.Source) DBSession.flush() for lid, maids in mas.items(): for ma in maids: DBSession.add(models.Languoidmacroarea( languoid_pk=data['Languoid'][lid].pk, macroarea_pk=data['Macroarea'][ma].pk)) for lid, cids in countries.items(): for cid in cids: DBSession.add(models.Languoidcountry( languoid_pk=data['Languoid'][lid].pk, country_pk=data['Country'][cid].pk)) for doctype in glottolog.hhtypes: data.add( models.Doctype, doctype.id, id=doctype.id, name=doctype.name, description=doctype.description, abbr=doctype.abbv, ord=doctype.rank) for bib in glottolog.bibfiles: data.add( models.Provider, bib.id, id=bib.id, name=bib.title, description=bib.description, abbr=bib.abbr, url=bib.url) DBSession.flush() s = time() for i, entry in enumerate( BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()): if i % 10000 == 0: args.log.info('{0}: {1:.3}'.format(i, time() - s)) s = time() ref = load_ref(data, entry, lgcodes, lgsources) if 'macro_area' in entry.fields: for ma in split_text(entry.fields['macro_area'], separators=',;', 
strip=True): ma = 'North America' if ma == 'Middle America' else ma ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma) DBSession.add(models.Refmacroarea( ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
def main(args): data = Data() print(args.data_file('x')) dataset = common.Dataset( id=grammaticon.__name__, name="Grammaticon", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='grammaticon.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) for i, ed in enumerate(['Martin Haspelmath', 'Robert Forkel']): common.Editor(dataset=dataset, contributor=get_contributor(data, ed), ord=i + 1) eng = data.add(common.Language, 'eng', name='English') for obj in reader(args.data_file('Feature_lists.csv'), dicts=True): contrib = data.add( models.Featurelist, obj['id'], id=slug(obj['name']), name=obj['name'], year=obj['year'], number_of_features=int(obj['number of features']) if obj['number of features'] else None, url=obj['year']) if obj['authors']: for i, author in enumerate(obj['authors'].split(',')): common.ContributionContributor( contribution=contrib, contributor=get_contributor(data, author), ord=i + 1) #id,name,feature_area for name, objs in itertools.groupby( sorted(reader(args.data_file('Metafeatures.csv'), dicts=True), key=lambda i: i['name']), lambda i: i['name']): dbobj = None for obj in objs: if not dbobj: dbobj = data.add( models.Metafeature, obj['id'], id=slug(obj['id']), name=obj['name'], area=obj['feature_area']) else: data['Metafeature'][obj['id']] = dbobj DBSession.flush() #feature_ID,feature name,feature description,meta_feature_id,collection_id,collection URL,collection numbers for obj in reader(args.data_file('Features.csv'), dicts=True): if int(obj['collection_id']) == 8: obj['collection_id'] = '1' if (not obj['meta_feature_id']): #or obj['meta_feature_id'] in ('89'): print('skipping: {}'.format(obj)) continue vsid = (data['Featurelist'][obj['collection_id']].pk, data['Metafeature'][obj['meta_feature_id']].pk) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='{0}-{1}'.format(*vsid), contribution=data['Featurelist'][obj['collection_id']], parameter=data['Metafeature'][obj['meta_feature_id']], language=eng) models.Feature( valueset=vs, id=slug(obj['feature_ID']), name=obj['feature name'], description=obj['feature description']) for obj in reader(args.data_file('Concepts.csv'), dicts=True): data.add( models.Concept, obj['id'], id=obj.pop('id'), name=obj.pop('label'), description=obj.pop('definition'), **{k.replace(' ', '_'): v for k, v in obj.items()}) for obj in reader(args.data_file('Concepts_metafeatures.csv'), dicts=True): if obj['meta_feature__id'] in ('89',): print('skipping: {}'.format(obj)) continue if obj['concept_id'] and obj['meta_feature__id']: models.ConceptMetafeature( concept=data['Concept'][obj['concept_id']], metafeature=data['Metafeature'][obj['meta_feature__id']]) for obj in reader(args.data_file('Concepthierarchy.csv'), dicts=True): child = data['Concept'].get(obj['concept_id']) if child: parent = data['Concept'].get(obj['concept_parent_id']) if parent: DBSession.add(models.ConceptRelation(parent=parent, child=child))
def main(args): data = Data() for rec in Database.from_file( data_path('references.bib'), lowercase=False): source = data.add(common.Source, rec.id, _obj=bibtex2source(rec)) dataset = common.Dataset( id=clts.__name__, name="CLTS", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", contact='*****@*****.**', domain='clts.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) for i, name in enumerate(['Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi', 'Thiago Chacon', 'Robert Forkel']): c = common.Contributor(id=slug(name), name=name) dataset.editors.append(common.Editor(contributor=c, ord=i)) for i, line in enumerate(reader(data_path('sounds.tsv'), delimiter='\t', namedtuples=True)): if not i % 100: print('-', end="") key = line.NAME.replace(' ', '_') data.add( models.SoundSegment, key, id=key, name = line.NAME, grapheme=line.GRAPHEME, aliases=line.ALIASES, representation=len(line.REFLEXES.split(',')), reflexes = line.REFLEXES, generated = True if line.GENERATED else False, unicode = line.UNICODE, ) print('') english = data.add( common.Language, 'eng', id='eng', name='English') contributions = {} for line in reader(data_path('datasets.tsv'), delimiter='\t', namedtuples=True): contributions[line.NAME] = data.add( models.CLTSDataSet, line.NAME, id=line.NAME, name=line.NAME, description=line.DESCRIPTION, datatype=line.TYPE ) for id_ in line.REFS.split(', '): common.ContributionReference( source=data['Source'][id_], contribution=contributions[line.NAME]) visited = set() for i, line in enumerate(reader(data_path('graphemes.tsv'), delimiter="\t", namedtuples=True)): if not i % 100: print('-', end='') key = line.DATASET + ':' + line.NAME+':'+line.GRAPHEME if key not in visited: sound_id = line.NAME.replace(' ', '_') vs = common.ValueSet( id=key, description=line.NAME, language=english, contribution=contributions[line.DATASET], parameter=data['SoundSegment'][sound_id] ) data.add( models.Grapheme, key, id=key, grapheme=line.GRAPHEME, bipa_grapheme=line.BIPA, name=line.NAME, dataset=line.DATASET, datatype=line.DATATYPE, frequency=line.FREQUENCY or 0, image=line.IMAGE, url=line.URL, valueset=vs ) visited.add(key) print('-')
def main(args): data = Data() dataset = common.Dataset( id=plld_app.__name__, name=plld_app.__name__, domain='plld.clld.org', description="Database of Papuan Language and Culture", ) DBSession.add(dataset) # Load the list of languages languages = pandas.ExcelFile( os.path.join(DBPATH, "Languages and Coordinates.xlsx")).parse(0) parameters = {} for i, language in languages.iterrows(): # Generate the database object for each language print("\nCreating language", language['Language name (-dialect)']) lang = models.Lect( id=((language['Language name (-dialect)'].lower()[:4] + "x" + newid()) if pandas.isnull(language['Glottolog']) else language['Glottolog'].strip()), region=language['Region'], family=language['Family'], name=language['Language name (-dialect)'].strip(), latitude=language['Lat'], longitude=language['Lon']) # Check what data files we have that say they are about that language. if pandas.isnull(language['ISO_code']): # The convention for file-names of varieties without iso # code is ad-hoc, skip those until we have established a # good convention. files_concerning = [ file for file in os.listdir(DBPATH) if file.lower().startswith(language['Internal'].lower() + '_') ] else: # Otherwise, the convention is that languages are # described by files starting with their iso code and an # underscore. files_concerning = [ file for file in os.listdir(DBPATH) if file.lower().startswith(language['ISO_code'].lower() + '_') ] # For each such language, we might have typological, sociolinguistic and lexical (small vocabulary or big vocabulary or kinship terms) data. Deal with them in order. # Try to load the corresponding typology questionnaire typology_files = [ file for file in files_concerning if 'typolog' in file.lower() ] if len(typology_files) == 1: try: add_typological_data(typology_files[0], parameters, lang) print("Typological data read.") except UnexpectedTypologyFormatError: print( "File", typology_files[0], "had an unexpected format for a typology questionnaire!") else: print("There were not one, but", len(typology_files), "possible questionnaires.") # Try to load the corresponding cultural features questionnaire culture_files = [ file for file in files_concerning if 'cult' in file.lower() ] if len(culture_files) == 1: try: add_cultural_data(culture_files[0], parameters, lang) print("Cultural data read.") except UnexpectedCultureFormatError: print("File", culture_files[0], "had an unexpected format for a culture questionnaire!") else: print("There were not one, but", len(culture_files), "possible questionnaires.")
def main(args): _ = args data = Data() cldf_data = args.cldf data.add(common.Contributor, 'fehnannemarie', id='fehnannemarie', name="Anne-Marie Fehn", url="https://shh.mpg.de") # TODO: Editors/Contributors dataset = common.Dataset(id=kba.__name__, name="KBA", publisher_name="Max Planck Institute for the " "Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by" "/4.0/", domain='kba.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons ' 'Attribution 4.0 ' 'International ' 'License' }) DBSession.add(dataset) for i, editor in enumerate(['fehnannemarie']): common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1) contrib = common.Contribution(id='contrib', name='the contribution') for language in cldf_data['LanguageTable']: lang = data.add(models.KbaLanguage, language['ID'], id=language['ID'], name=language['Name']) add_language_codes(data, lang, None, glottocode=language['Glottocode']) # TODO: Concepticon for parameter in cldf_data['ParameterTable']: data.add(common.Parameter, parameter['ID'], id=parameter['ID'], name='{0} ({1})'.format(parameter['Name'], parameter['ID'])) for form in cldf_data['FormTable']: valueset_id = '{0}-{1}'.format(form['Parameter_ID'], form['Language_ID']) valueset = data['ValueSet'].get(valueset_id) # Unless we already have something in the VS: if not valueset: valueset = data.add( common.ValueSet, valueset_id, id=valueset_id, language=data['KbaLanguage'][form['Language_ID']], parameter=data['Parameter'][form['Parameter_ID']], contribution=contrib) DBSession.add( models.Word(id=form['ID'], name=form['Form'], comment=form.get('Comment'), sourceorthography=form.get('sourceorthography'), kbaorthography=form.get('kbaorthography'), wordclass=form.get('wordclass'), grammaticalnotes=form.get('grammaticalnotes'), idiolectalvariant=form.get('idiolectalvariant'), originaltranslation=form.get('originaltranslation'), valueset=valueset)) load_families(data, [(l.glottocode, l) for l in data['KbaLanguage'].values()], glottolog_repos=args.glottolog, isolates_icon='tcccccc')
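# Sketch of the get-or-create step used above: several forms for the same concept in
# the same language share one ValueSet, so the cache is consulted before a new
# ValueSet is added under the (parameter, language) key. The helper name is
# hypothetical.
from clld.db.models import common


def get_or_create_valueset(data, language, parameter, contribution):
    vsid = '{0}-{1}'.format(parameter.id, language.id)
    vs = data['ValueSet'].get(vsid)
    if not vs:
        vs = data.add(
            common.ValueSet, vsid,
            id=vsid, language=language, parameter=parameter, contribution=contribution)
    return vs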
def main(args): data = Data() data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs) dataset = common.Dataset( id=concepticon.__name__, name="Concepticon 1.0", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", contact='*****@*****.**', domain='concepticon.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) DBSession.add(dataset) for i, name in enumerate( ['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']): c = common.Contributor(id=slug(name), name=name) dataset.editors.append(common.Editor(contributor=c, ord=i)) english = data.add(common.Language, 'eng', id='eng', name='English') files = {} for fname in data_path('sources').iterdir(): files[fname.stem] = \ "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name for rec in Database.from_file(data_path('references', 'references.bib'), lowercase=True): source = data.add(common.Source, rec.id, _obj=bibtex2source(rec)) if rec.id in files: DBSession.flush() DBSession.add( common.Source_files(mime_type='application/pdf', object_pk=source.pk, jsondata=dict(url=files[rec.id]))) for concept in reader(data_path('concepticon.tsv'), namedtuples=True): data.add(models.ConceptSet, concept.ID, id=concept.ID, name=concept.GLOSS, description=concept.DEFINITION, semanticfield=concept.SEMANTICFIELD, ontological_category=concept.ONTOLOGICAL_CATEGORY) for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True): DBSession.add( models.Relation(source=data['ConceptSet'][rel.SOURCE], target=data['ConceptSet'][rel.TARGET], description=rel.RELATION)) unmapped = Counter() number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)') for cl in reader(data_path('conceptlists.tsv'), dicts=True): concepts = data_path('conceptlists', '%(ID)s.tsv' % cl) if not concepts.exists(): continue langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])] conceptlist = data.add( models.Conceptlist, cl['ID'], id=cl['ID'], name=' '.join(cl['ID'].split('-')), description=cl['NOTE'], target_languages=cl['TARGET_LANGUAGE'], source_languages=' '.join(langs), year=int(cl['YEAR']) if cl['YEAR'] else None, ) for id_ in split(cl['REFS']): common.ContributionReference(source=data['Source'][id_], contribution=conceptlist) for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')): name = strip_braces(name) contrib = data['Contributor'].get(name) if not contrib: contrib = data.add(common.Contributor, name, id=slug(name), name=name) DBSession.add( common.ContributionContributor(ord=i, contribution=conceptlist, contributor=contrib)) for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split( ): del cl[k] DBSession.flush() for k, v in cl.items(): DBSession.add( common.Contribution_data(object_pk=conceptlist.pk, key=k, value=v)) for concept in reader(concepts, namedtuples=True): if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN': #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None)) unmapped.update([conceptlist.id]) continue lgs = {} for lang in langs: v = getattr(concept, lang.upper()) if v: lgs[lang] = v match = number_pattern.match(concept.NUMBER) if not match: print(concept.ID) raise ValueError vs = common.ValueSet( id=concept.ID, description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)), language=english, 
contribution=conceptlist, parameter=data['ConceptSet'][concept.CONCEPTICON_ID]) d = {} for key, value in concept.__dict__.items(): if not key.startswith('CONCEPTICON_') and \ key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]: d[key.lower()] = value v = models.Concept( id=concept.ID, valueset=vs, description=getattr(concept, 'GLOSS', None), # our own gloss, if available name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())), number=int(match.group('number')), number_suffix=match.group('suffix'), jsondata=d) DBSession.flush() for key, value in lgs.items(): DBSession.add( common.Value_data(key='lang_' + key, value=value, object_pk=v.pk)) print('Unmapped concepts:') for clid, no in unmapped.most_common(): print(clid, no) for fname in data_path('concept_set_meta').iterdir(): if fname.suffix == '.tsv': md = load(fname.parent.joinpath(fname.name + '-metadata.json')) provider = models.MetaProvider(id=fname.stem, name=md['dc:title'], description=md['dc:description'], url=md['dc:source'], jsondata=md) for meta in reader(fname, dicts=True): try: for k, v in meta.items(): if v and k != 'CONCEPTICON_ID': models.ConceptSetMeta(metaprovider=provider, conceptset=data['ConceptSet'] [meta['CONCEPTICON_ID']], key=k, value=v) except: print(fname) print(meta) raise
def main(args): fts.index('fts_index', Word.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") data = Data() dataset = common.Dataset( id=dictionaria.__name__, name="Dictionaria", description="The Dictionary Journal", published=date(2017, 3, 30), contact='*****@*****.**', domain='dictionaria.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) for i, (id_, name) in enumerate([ ('haspelmathmartin', 'Martin Haspelmath'), ('moselulrike', 'Ulrike Mosel'), ('stiebelsbarbara', 'Barbara Stiebels') ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) comparison_meanings = {} print('loading concepts ...') glosses = set() concepticon = Concepticon( REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data')) if not args.no_concepts: for conceptset in concepticon.conceptsets.values(): if conceptset.gloss in glosses: continue glosses.add(conceptset.gloss) cm = data.add( ComparisonMeaning, conceptset.id, id=conceptset.id, name=conceptset.gloss.lower(), description=conceptset.definition, concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id) comparison_meanings[cm.id] = cm DBSession.flush() print('... done') comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()} submissions = [] for submission in REPOS.joinpath( 'submissions-internal' if args.internal else 'submissions').glob('*'): if not submission.is_dir(): continue try: submission = Submission(submission) except ValueError: continue md = submission.md if md is None: continue if not md['date_published']: continue id_ = submission.id if args.dict and args.dict != id_ and args.dict != 'all': continue lmd = md['language'] props = md.get('properties', {}) props.setdefault('custom_fields', []) props['metalanguage_styles'] = {} for v, s in zip(props.get('metalanguages', {}).values(), ['success', 'info', 'warning', 'important']): props['metalanguage_styles'][v] = s props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f for f in props['custom_fields']] language = data['Variety'].get(lmd['glottocode']) if not language: language = data.add( Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name']) md['date_published'] = md['date_published'] or date.today().isoformat() if '-' not in md['date_published']: md['date_published'] = md['date_published'] + '-01-01' dictionary = data.add( Dictionary, id_, id=id_, number=md.get('number'), name=props.get('title', lmd['name'] + ' dictionary'), description=submission.description, language=language, published=date(*map(int, md['date_published'].split('-'))), jsondata=props) for i, spec in enumerate(md['authors']): if not isinstance(spec, dict): cname, address = spec, None spec = {} else: cname, address = spec['name'], spec.get('affiliation') name = HumanName(cname) cid = slug('%s%s' % (name.last, name.first)) contrib = data['Contributor'].get(cid) if not contrib: contrib = data.add( common.Contributor, cid, id=cid, name=cname, address=address, url=spec.get('url'), email=spec.get('email')) DBSession.add(common.ContributionContributor( ord=i + 1, primary=True, 
contributor=contrib, contribution=dictionary)) submissions.append((dictionary.id, language.id, submission)) transaction.commit() for did, lid, submission in submissions: #if submission.id != 'sidaama': # continue transaction.begin() print('loading %s ...' % submission.id) dictdata = Data() lang = Variety.get(lid) submission.load_examples(Dictionary.get(did), dictdata, lang) submission.dictionary.load( submission, dictdata, Dictionary.get(did), lang, comparison_meanings, OrderedDict(submission.md.get('properties', {}).get('labels', []))) transaction.commit() print('... done') transaction.begin() load_families( Data(), [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)], glottolog_repos='../../glottolog3/glottolog')
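# Sketch of the per-submission transaction batching used above; `transaction` is the
# package behind pyramid_tm, and `load_one` stands for whatever does the actual
# DBSession work for one dictionary. Committing after each submission keeps memory
# bounded and makes partial re-runs cheaper.
import transaction


def load_in_batches(submissions, load_one):
    for submission in submissions:
        transaction.begin()
        load_one(submission)
        transaction.commit()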
def main(args): data = Data() dataset = common.Dataset( id=cobl2.__name__, name="IE-CoR", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://www.eva.mpg.de", license="https://creativecommons.org/licenses/by/4.0/", domain='iecor.clld.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) editors = OrderedDict([('Heggarty', None), ('Anderson', None), ('Scarborough', None)]) for row in sorted(ds['authors.csv'], key=lambda x: [ x['Last_Name'].lower(), x['First_Name'].lower()]): if row['Last_Name'] in editors: editors[row['Last_Name']] = row['ID'] data.add( models.Author, row['ID'], id=row['ID'], name='{0} {1}'.format(row['First_Name'], row['Last_Name']), url=row['URL'], photo=data_uri(photos[row['Last_Name']], 'image/jpg') if row['Last_Name'] in photos else None) for i, cid in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=data['Author'][cid], ord=i + 1) for src in ds.sources.items(): for invalid in ['isbn', 'part', 'institution']: if invalid in src: del src[invalid] data.add( common.Source, src.id, id=src.id, name=src.get('author', src.get('editor')), description=src.get('title', src.get('booktitle')), bibtex_type=getattr(EntryType, src.genre, EntryType.misc), **src) re_links = re.compile(r'\[(?P<label>[^\]]+?)\]\((?P<type>.+?)-(?P<id>\d+)\)') link_map = { 'cog': '/cognatesets/', 'lex': '/values/', 'src': '/sources/', } def parse_links(m): try: return '<a href="{}{}">{}</a>'.format( link_map[m.group('type')], m.group('id'), m.group('label')) except KeyError: print("parse_links: type error in '{}'".format(":".join(m.groups()))) return '[{}]({}-{})'.format(m.group('label'), m.group('type'), m.group('id')) for param in ds['ParameterTable']: data.add( models.Meaning, param['ID'], id=slug(param['Name']), name=param['Name'], description_md=param['Description_md'], concepticon_id=int(param['Concepticon_ID']) if param['Concepticon_ID'] != '0' else None, ) for row in ds['clades.csv']: data.add( models.Clade, row['ID'], id=row['ID'], level0_name=row['level0_name'], level1_name=row['level1_name'], level2_name=row['level2_name'], level3_name=row['level3_name'], clade_level0=row['clade_level0'], clade_level1=row['clade_level1'], clade_level2=row['clade_level2'], clade_level3=row['clade_level3'], clade_name=row['clade_name'], short_name=row['short_name'], color=row['color'], ) for row in ds['LanguageTable']: c = data.add( common.Contribution, row['ID'], id=row['ID'], name=row['Name'], ) for i, cid in enumerate(row['Author_ID']): DBSession.add(common.ContributionContributor( contribution=c, contributor=data['Author'][cid], ord=i + 1)) data.add( models.Variety, row['ID'], id=slug(row['Name']), name=row['Name'], latitude=float(row['Latitude']) if row['Latitude'] is not None else None, longitude=float(row['Longitude']) if row['Longitude'] is not None else None, contribution=c, color=rgb_as_hex(row['Color']), clade=', '.join(filter(None, row['Clade'])), clade_name=row['clade_name'], glottocode=row['Glottocode'], historical=row['historical'], distribution=row['distribution'], logNormalMean=row['logNormalMean'], logNormalOffset=row['logNormalOffset'], logNormalStDev=row['logNormalStDev'], normalMean=row['normalMean'], normalStDev=row['normalStDev'], ascii_name=row['ascii_name'], iso=row['ISO639P3code'], lang_description=row['Description'], variety=row['Variety'], loc_justification=row['loc_justification'] or 
None, sort_order=row['sort_order'] ) vsrs = set() for row in ds['FormTable']: vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID'])) if not vs: vs = data.add( common.ValueSet, (row['Language_ID'], row['Parameter_ID']), id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']), language=data['Variety'][row['Language_ID']], parameter=data['Meaning'][row['Parameter_ID']], contribution=data['Contribution'][row['Language_ID']], ) v = data.add( models.Lexeme, row['ID'], id=row['ID'], name=row['Form'], native_script=row['native_script'], phonetic=row['phon_form'], phonemic=row['Phonemic'], comment=re_links.sub(parse_links, row['Comment'] or ''), url=row['url'], gloss=row['Gloss'], valueset=vs ) for src in row['Source']: sid, pages = ds.sources.parse(src) key = (vs.id, sid, pages) if pages: pages = pages.replace('|', ';') if key not in vsrs: DBSession.add(common.ValueSetReference( valueset=vs, source=data['Source'][sid], description=pages)) vsrs.add(key) for row in ds['CognatesetTable']: cc = data.add( models.CognateClass, row['ID'], id=row['ID'], name=row['ID'], root_form=row['Root_Form_calc'] if row['Root_Form_calc'] is not None and len(row['Root_Form_calc']) else row['Root_Form'], root_form_calc=row['Root_Form_calc'] or None, root_gloss=row['Root_Gloss'] or None, root_language=row['Root_Language_calc'] if row['Root_Language_calc'] is not None and len(row['Root_Language_calc']) else row['Root_Language'], root_language_calc=row['Root_Language_calc'] or None, comment=re_links.sub(parse_links, row['Comment'] or ''), justification=re_links.sub(parse_links, row['Justification'] or ''), ideophonic=row['Ideophonic'] or None, parallel_derivation=row['parallelDerivation'] or None, revised_by=','.join(row['revised_by']) or None, superset_id=int(row['supersetid']) if row['supersetid'] else None, ) for src in row['Source']: sid, pages = ds.sources.parse(src) if pages: pages = pages.replace('|', ';') DBSession.add(clld_cognacy_plugin.models.CognatesetReference( cognateset=cc, source=data['Source'][sid], description=pages)) DBSession.flush() cc_id_pk_map = {str(ccid): cc.pk for ccid, cc in data['CognateClass'].items()} for row in ds['CognatesetTable']: if row['proposedAsCognateTo_pk']: DBSession.add(models.ProposedCognates( cc1_pk=data['CognateClass'][row['ID']].pk, cc2_pk=cc_id_pk_map[str(row['proposedAsCognateTo_pk'])], scale=row['proposedAsCognateToScale'] )) DBSession.flush() loans = {ln['Cognateset_ID']: ln for ln in ds['loans.csv']} for ccid, cc in data['CognateClass'].items(): if ccid in loans: le = loans[ccid] if le['SourceCognateset_ID']: cc.loan_source_pk = data['CognateClass'][le['SourceCognateset_ID']].pk else: cc.loan_source_pk = None cc.loan_notes = le['Comment'] cc.loan_source_languoid = le['Source_languoid'] cc.loan_source_form = le['Source_form'] cc.parallel_loan_event = le['Parallel_loan_event'] cc.is_loan = True for row in ds['CognateTable']: cc = data['CognateClass'][row['Cognateset_ID']] if cc.meaning_pk is None: cc.meaning_pk = data['Lexeme'][row['Form_ID']].valueset.parameter_pk else: assert data['Lexeme'][row['Form_ID']].valueset.parameter_pk == cc.meaning_pk data.add( clld_cognacy_plugin.models.Cognate, row['ID'], cognateset=data['CognateClass'][row['Cognateset_ID']], counterpart=data['Lexeme'][row['Form_ID']], doubt=row['Doubt'], ) l_by_gc = {} for s in DBSession.query(models.Variety): l_by_gc[s.glottocode] = s.pk tree = Phylogeny( id='1', name='Bouckaert et al.', description='', newick=Path.read_text(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'newick.txt'), ) 
for k, taxon in enumerate(reader(data_file_path / 'raw' / 'bouckaert_et_al2012' / 'taxa.csv', namedtuples=True)): label = TreeLabel( id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1), name=taxon.taxon, phylogeny=tree, description=taxon.glottocode) if taxon.glottocode in l_by_gc: LanguageTreeLabel(language_pk=l_by_gc[taxon.glottocode], treelabel=label) DBSession.add(tree) l_by_ascii = {} for s in DBSession.query(models.Variety): l_by_ascii[s.ascii_name] = s.pk tree = Phylogeny( id='2', name='CoBL consensus', description='', newick=Path.read_text(data_file_path / 'raw' / 'ie122' / 'newick.txt'), ) for k, taxon in enumerate(reader(data_file_path / 'raw' / 'ie122' / 'taxa.csv', namedtuples=True)): label = TreeLabel( id='{0}-{1}-{2}'.format(tree.id, slug(taxon.taxon), k + 1), name=taxon.taxon, phylogeny=tree) if taxon.taxon in l_by_ascii: LanguageTreeLabel(language_pk=l_by_ascii[taxon.taxon], treelabel=label) DBSession.add(tree)
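# Sketch of how the phylogeny objects above fit together (tree, label and ids are
# hypothetical; the model import path assumes clld_phylogeny_plugin): the Phylogeny
# stores the newick string, each TreeLabel names one tip, and LanguageTreeLabel ties
# a tip to a Language so tree tips can link to language pages.
from clld.db.meta import DBSession
from clld_phylogeny_plugin.models import LanguageTreeLabel, Phylogeny, TreeLabel


def sketch_add_tree(language):
    tree = Phylogeny(id='demo', name='Demo tree', newick='(A,B);')
    label = TreeLabel(id='demo-a', name='A', phylogeny=tree)
    LanguageTreeLabel(language=language, treelabel=label)
    DBSession.add(tree)
    return tree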
def main(args): data = Data() dataset = common.Dataset( id=amsd.__name__, name="AMSD", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='amsd.clld.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) editors = OrderedDict([('Piers Kelly', None)]) # data_entry => Contributor for row in sorted(dicts('data_entry'), key=lambda x: [ x['name'].lower()] ): if row['name'] in editors: editors[row['name']] = row['pk'] data.add( common.Contributor, row['pk'], id=row['pk'], name=row['name'] ) for i, cid in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=data['Contributor'][cid], ord=i + 1) for row in dicts('source_citation'): data.add( common.Source, row['pk'], id=row['pk'], note=row['name'], name=row['name'], ) for row in dicts('ling_area'): data.add( models.ling_area, row['pk'], chirila_name = row['chirila_name'], austlang_code = row['austlang_code'], austlang_name = row['austlang_name'], glottolog_code = row['glottolog_code'], ) fd = {} for row in dicts('linked_filenames'): if row['name'] not in ['00-Text_reference.png', '00-No_image_available.png']: fd[row['pk']] = dict( name = row['name'], oid = row['oid'], path = row['path'], mimetype = mimetypes.guess_type(row['path'])[0], ) for m in 'item_type technique keywords material source_type sem_domain holder_file'.split(): for row in dicts(m): data.add( getattr(models, m), row['pk'], name = row['name'], ) DBSession.flush() # sticks => MessageStick no_fts_cols = ['pk', 'latitude', 'longitude', 'item_type', 'irn', 'data_entry', 'dim_1', 'dim_2', 'dim_3', 'data_entry', 'ling_area_1', 'ling_area_2', 'ling_area_3', 'holder_file'] x_cols = ['sem_domain', 'material', 'source_type', 'technique', 'keywords', 'holder_file', 'item_type'] for i, row in enumerate(dicts('sticks')): fts_items = [] for col in row.keys(): if col: if col == 'amsd_id': fts_items.append(row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i),) elif col not in no_fts_cols and not col.endswith('_pk'): fts_items.append(row[col]) for t in x_cols: if row[t]: for _, k in enumerate(row[t].split(';')): fts_items.append(str(data[t][k])) fts_items.extend(str(data[t][k]).split('_')) for t in ['ling_area_1', 'ling_area_2', 'ling_area_3']: if row[t]: for _, k in enumerate(row[t].split(';')): fts_items.append(data['ling_area'][k].chirila_name) fts_items.append(data['ling_area'][k].austlang_code) fts_items.append(data['ling_area'][k].austlang_name) fts_items.append(data['ling_area'][k].glottolog_code) if row['source_citation']: for k in row['source_citation'].split(';'): data.add( common.ContributionReference, k, contribution_pk = int(row['pk']), source_pk = int(k), ) fts_items.append(str(data['Source'][k])) if row['linked_filenames']: for j, k in enumerate(row['linked_filenames'].split(';')): if k in fd: oid = fd[k].get('oid') mt = fd[k].get('mimetype') refobjid = '' if mt == 'application/pdf': refobjid = oid # use for web, thumbnail a place holder image oid = 'EAEA0-52CC-0295-6B71-0' n = fd[k].get('name') data.add( common.Contribution_files, k, id='%s-%s-%i' % (k, row['pk'], j), object_pk = int(row['pk']), name = n, jsondata = dict( original = fd[k].get('path'), objid = oid, refobjid = refobjid, web = 'web.jpg', thumbnail = 'thumbnail.jpg', ), ord=j, mime_type = mt, ) fts_items.append(n) 
fts_items.extend(nfilter(re.split('[_\-\.]', n))) data.add( models.MessageStick, row['pk'], id = row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i), title = row['title'], description = row['description'], obj_creator = row['obj_creator'], date_created = row['date_created'], note_place_created = row['note_place_created'], place_created = row['place_created'], item_type_pk = row['item_type'] or None, ling_area_1_pk = row['ling_area_1'] or None, ling_area_2_pk = row['ling_area_2'] or None, ling_area_3_pk = row['ling_area_3'] or None, notes_ling_area = row['notes_ling_area'], stick_term = row['stick_term'], message = row['message'], motifs = row['motifs'], motif_transcription = row['motif_transcription'], dim_1 = row['dim_1'], dim_2 = row['dim_2'], dim_3 = row['dim_3'], date_collected = row['date_collected'], holder_file_pk = row['holder_file'] or None, holder_obj_id = row['holder_obj_id'], collector = row['collector'], place_collected = row['place_collected'], creator_copyright = row['creator_copyright'], file_copyright = row['file_copyright'], latitude = row['lat'] or None, longitude = row['long'] or None, notes_coords = row['notes_coords'], url_institution = row['url_institution'], url_source_1 = row['url_source_1'], url_source_2 = row['url_source_2'], irn = row['irn'], notes = row['notes'], data_entry = row['data_entry'], fts = fts.tsvector('\n'.join(re.sub('[_\-]','.',v) for v in fts_items)), ) DBSession.flush() for row in dicts('sticks'): for t in ['sem_domain', 'material', 'source_type', 'technique', 'keywords']: if row[t]: for _, k in enumerate(row[t].split(';')): data.add( getattr(models, 'x_%s' % (t)), k, object_pk = int(row['pk']), item_pk = int(k), )
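# Sketch of the full-text-search idiom above (the clld.db.fts import path is an
# assumption based on how `fts` is used in these loaders): searchable strings
# collected for an object are folded into a tsvector at load time, and the matching
# index is created once against the bound PostgreSQL engine. Works only for models
# that define an `fts` column, as Word, Ref and MessageStick do above.
from clld.db import fts
from clld.db.meta import DBSession


def sketch_fts(model_cls, instance, searchable_strings):
    fts.index('fts_index', model_cls.fts, DBSession.bind)
    instance.fts = fts.tsvector('\n'.join(searchable_strings))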
def main(args):
    Index('ducet', collkey(func.translate(common.Value.name, 'ˈ,ː,ˌ', '')))\
        .create(DBSession.bind)
    data = Data()

    dataset = common.Dataset(
        id=numerals.__name__,
        name="Numeralbank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain="numerals.clld.org",
        jsondata={
            "license_icon": "cc-by.png",
            "license_name": "Creative Commons Attribution 4.0 International License",
        },
    )

    DBSession.add(dataset)

    for i, (id_, name) in enumerate([
        ("verkerkannemarie", "Annemarie Verkerk"),
        ("rzymskichristoph", "Christoph Rzymski"),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    DBSession.add(dataset)

    # Take meta data from curated CLDF data set
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Parameters:
    for parameter in ds["ParameterTable"]:
        data.add(
            models.NumberParameter,
            parameter["ID"],
            id=parameter["ID"],
            name="{0}".format(parameter["ID"]),
            concepticon_id=parameter['Concepticon_ID'],
        )
    basis_parameter = data.add(
        models.NumberParameter,
        "0",
        id="0",
        name="Base",
    )

    load_family_langs = []
    for language in ds["LanguageTable"]:
        lang = data.add(
            models.Variety,
            language["ID"],
            id=language["ID"],
            name=language["Name"],
            latitude=language["Latitude"],
            longitude=language["Longitude"],
            creator=language["Contributor"],
            comment=language["Comment"],
            url_soure_name=language["SourceFile"],
        )
        if language["Glottocode"]:
            load_family_langs.append((language["Glottocode"], lang))

    # get original forms
    ds = Wordlist.from_metadata(data_repos[0]['data_path'] / 'cldf' / 'cldf-metadata.json')
    org_forms = {f["ID"]: f for f in ds["FormTable"]}

    d = data_repos[1]
    contrib = data.add(
        common.Contribution,
        d['id'],
        id=d['id'],
        name=d['name'],
    )

    # process curated forms
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Add Base info if given
    for language in ds["LanguageTable"]:
        if language["Base"]:
            basis = language["Base"]
            de = data["DomainElement"].get(basis)
            if not de:
                de = data.add(
                    common.DomainElement,
                    basis,
                    id=text_type(basis),
                    name=text_type(basis),
                    parameter=basis_parameter,
                )
            vs = data.add(
                common.ValueSet,
                data["Variety"][language["ID"]].id,
                id=data["Variety"][language["ID"]].id,
                language=data["Variety"][language["ID"]],
                parameter=basis_parameter,
                contribution=contrib,
            )
            common.Value(
                id=data["Variety"][language["ID"]].id,
                valueset=vs,
                domainelement=de,
            )

    # Forms:
    for form in ds["FormTable"]:
        valueset_id = "{0}-{1}".format(form["Parameter_ID"], form["Language_ID"])
        valueset = data["ValueSet"].get(valueset_id)

        # Unless we already have something in the VS:
        if not valueset:
            if form["Language_ID"] in data["Variety"]:
                vs = data.add(
                    common.ValueSet,
                    valueset_id,
                    id=valueset_id,
                    language=data["Variety"][form["Language_ID"]],
                    parameter=data["NumberParameter"][form["Parameter_ID"]],
                    contribution=contrib,
                )

                org_form = ""
                if form["ID"] in org_forms:
                    if unicodedata.normalize(
                            'NFC', org_forms[form["ID"]]["Form"].strip()) != form["Form"]:
                        org_form = org_forms[form["ID"]]["Form"]
                else:
                    org_form = "no original form"

                DBSession.add(
                    models.NumberLexeme(
                        id=form["ID"],
                        name=form["Form"],
                        comment=form["Comment"],
                        is_loan=form["Loan"],
                        other_form=form["Other_Form"],
                        org_form=org_form,
                        is_problematic=form["Problematic"],
                        valueset=vs,
                    )
                )

    load_families(
        Data(),
        load_family_langs,
        glottolog_repos=gl_repos,
        strict=False,
    )

    distinct_varieties = DBSession.query(models.Variety.family_pk).distinct().all()
    families = dict(
        zip([r[0] for r in distinct_varieties],
            color.qualitative_colors(len(distinct_varieties))))
    for l in DBSession.query(models.Variety):
        l.jsondata = {"color": families[l.family_pk]}

    p = common.Parameter.get("0")
    colors = color.qualitative_colors(len(p.domain))
    for i, de in enumerate(p.domain):
        de.jsondata = {"color": colors[i]}
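# The block above hands out one qualitative color per distinct family_pk (including
# None for isolates) and stores it in each Variety's jsondata. A small standalone
# illustration of the same zip/dict pattern, using a fixed palette instead of
# clldutils' color helpers so it runs without the clld stack (assign_colors is a
# hypothetical name, not part of the numerals code base):

from itertools import cycle

def assign_colors(keys, palette=('#1b9e77', '#d95f02', '#7570b3')):
    """Map each distinct key to one palette color, cycling if there are more keys."""
    return dict(zip(keys, cycle(palette)))

# assign_colors([1, 2, None]) -> {1: '#1b9e77', 2: '#d95f02', None: '#7570b3'}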
def main(args):  # pragma: no cover
    glottocodes = {}
    for row in GC.execute(
            'select ll.hid, l.id from language as l, languoid as ll where ll.pk = l.pk'):
        if row[0] and len(row[0]) == 3:
            glottocodes[row[0]] = row[1]

    icons = issues.Icons()
    old_db = DB

    vs2008 = get_vs2008(args)

    missing_sources = []
    refdb_ids = {}
    max_id = 7350
    with open('/home/robert/venvs/clld/data/wals-data/missing_source.py', 'w') as fp:
        for row in old_db.execute("select * from reference"):
            try:
                author, year = row['id'].split('-')
            except:
                author, year = None, None
            bibdata = get_source(row['id'])
            if not bibdata:
                fp.write('"%s",\n' % row['id'])
                missing_sources.append(row['id'])
                bibdata['pk'] = max_id
                max_id += 1

            if bibdata['pk'] in refdb_ids:
                print('already seen:', row['id'], 'as', refdb_ids[bibdata['pk']])
                data['Source'][row['id']] = data['Source'][refdb_ids[bibdata['pk']]]
                continue
            refdb_ids[bibdata['pk']] = row['id']

            bibdata.update({
                'id': row['id'],
                'name': row['name'],
                'description': bibdata.get('title', bibdata.get('booktitle')),
                'google_book_search_id': row['gbs_id'] or None,
            })
            data.add(common.Source, row['id'], **bibdata)
            #
            # TODO: add additional bibdata as data items
            #

    print('sources missing for %s refs' % len(missing_sources))

    for id, name in ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id, name=name))

    migrate(
        'country', models.Country,
        lambda r: (r['id'], dict(id=r['id'], name=r['name'], continent=r['continent'])))

    migrate(
        'family', models.Family,
        lambda r: (r['id'], dict(id=r['id'], name=r['name'], description=r['comment'])))

    for row, icon in zip(
            list(old_db.execute("select * from genus order by family_id")),
            cycle(iter(icons))):
        genus = data.add(
            models.Genus, row['id'],
            id=row['id'], name=row['name'], icon=icon, subfamily=row['subfamily'])
        genus.family = data['Family'][row['family_id']]
    DBSession.flush()

    migrate(
        'altname', common.Identifier,
        lambda r: ((r['name'], r['type']),
                   dict(name=r['name'], type='name', description=r['type'])))

    # names for isolanguages are not unique!
    enames = {}
    for r in DB.execute("select * from isolanguage"):
        id_ = 'ethnologue-%s' % r['id']
        if r['name'] in enames:
            data['Identifier'][id_] = enames[r['name']]
        else:
            enames[r['name']] = data.add(
                common.Identifier, id_,
                id=id_, name=r['name'], type='name', description='ethnologue')
    DBSession.flush()

    migrate(
        'isolanguage', common.Identifier,
        lambda r: (r['id'],
                   dict(id=r['id'],
                        name=r['id'],
                        type=common.IdentifierType.iso.value,
                        description=r['name'])))

    migrate(
        'isolanguage', common.Identifier,
        lambda r: None if r['id'] not in glottocodes else (
            'gc-%s' % r['id'],
            dict(id='gc-%s' % r['id'],
                 name=glottocodes[r['id']],
                 type=common.IdentifierType.glottolog.value,
                 description=r['name'])))

    migrate(
        'language', models.WalsLanguage,
        lambda r: (r['id'],
                   dict(id=r['id'],
                        name=r['name'],
                        latitude=r['latitude'],
                        longitude=r['longitude'],
                        ascii_name=r['ascii_name'],
                        genus=data['Genus'][r['genus_id']],
                        samples_100=r['samples_100'] != 0,
                        samples_200=r['samples_200'] != 0)))

    migrate(
        'author', common.Contributor,
        lambda r: (r['id'],
                   dict(name=r['name'], url=r['www'], id=r['id'], description=r['note'])))

    dataset = common.Dataset(
        id='wals',
        name='WALS Online',
        description='The World Atlas of Language Structures Online',
        domain='wals.info',
        published=date(2013, 8, 15),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name':
                'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'})
    DBSession.add(dataset)

    for i, editor in enumerate(['dryerms', 'haspelmathm']):
        common.Editor(
            dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    migrate(
        'country_language', models.CountryLanguage,
        lambda r: dict(language_pk=data['WalsLanguage'][r['language_id']].pk,
                       country_pk=data['Country'][r['country_id']].pk))

    migrate(
        'altname_language', common.LanguageIdentifier,
        lambda r: dict(language=data['WalsLanguage'][r['language_id']],
                       identifier=data['Identifier'][(r['altname_name'], r['altname_type'])],
                       description=r['relation']))

    migrate(
        'isolanguage_language', common.LanguageIdentifier,
        lambda r: dict(language=data['WalsLanguage'][r['language_id']],
                       identifier=data['Identifier'][r['isolanguage_id']],
                       description=r['relation']))

    migrate(
        'isolanguage_language', common.LanguageIdentifier,
        lambda r: None
        if 'ethnologue-%s' % r['isolanguage_id'] not in data['Identifier']
        else dict(language=data['WalsLanguage'][r['language_id']],
                  identifier=data['Identifier']['ethnologue-%s' % r['isolanguage_id']],
                  description=r['relation']))

    migrate(
        'isolanguage_language', common.LanguageIdentifier,
        lambda r: None
        if 'gc-%s' % r['isolanguage_id'] not in data['Identifier']
        else dict(language=data['WalsLanguage'][r['language_id']],
                  identifier=data['Identifier']['gc-%s' % r['isolanguage_id']],
                  description=r['relation']))

    migrate(
        'area', models.Area,
        lambda r: (r['id'],
                   dict(name=r['name'], dbpedia_url=r['dbpedia_url'], id=str(r['id']))))

    def migrate_chapter(row):
        kw = dict(
            id=row['id'],
            name=row['name'],
            wp_slug=row['blog_title'],
            sortkey=int(row['id']),
            area=data['Area'][row['area_id']])
        if int(row['id']) in [143, 144]:
            kw['created'] = E2011
            kw['updated'] = E2011
        return row['id'], kw

    migrate('chapter', models.Chapter, migrate_chapter)

    def migrate_supplement(row):
        if row['name'] not in ['Help', 'Abbreviations']:
            sortkey = 990 + int(row['id']) if row['name'] != 'Introduction' else 0
            id_ = 's%s' % row['id']
            kw = dict(id=id_, name=row['name'], sortkey=sortkey)
            return id_, kw
    migrate('supplement', models.Chapter, migrate_supplement)

    migrate(
        'chapter_reference', common.ContributionReference,
        lambda r: dict(contribution=data['Chapter'][r['chapter_id']],
                       source=data['Source'][r['reference_id']]))

    migrate(
        'reference_supplement', common.ContributionReference,
        lambda r: dict(contribution=data['Chapter']['s%s' % r['supplement_id']],
                       source=data['Source'][r['reference_id']]))

    def migrate_feature(row):
        kw = dict(id=row['id'], name=row['name'], ordinal_qualifier=row['id'][-1])
        if row['id'].startswith('143') or row['id'].startswith('144'):
            kw['created'] = E2011
            kw['updated'] = E2011
        kw['chapter'] = data['Chapter'][row['chapter_id']]
        return row['id'], kw

    migrate('feature', models.Feature, migrate_feature)

    def migrate_value(row):
        desc = row['description']
        if desc == 'SOV & NegV/VNeg':
            if row['icon_id'] != 's9ff':
                desc += ' (a)'
            else:
                desc += ' (b)'
        kw = dict(
            id='%s-%s' % (row['feature_id'], row['numeric']),
            name=desc,
            description=row['long_description'],
            jsondata=dict(icon=issues.Icons.id(row['icon_id'])),
            number=row['numeric'],
            parameter=data['Feature'][row['feature_id']])
        return (row['feature_id'], row['numeric']), kw

    migrate('value', common.DomainElement, migrate_value)

    same = 0
    added = 0
    for row in old_db.execute("select * from datapoint"):
        parameter = data['Feature'][row['feature_id']]
        language = data['WalsLanguage'][row['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        created = E2008
        updated = E2008

        value_numeric = row['value_numeric']
        if (language.id, parameter.id) in vs2008:
            if vs2008[(language.id, parameter.id)] != row['value_numeric']:
                print('~~~', id_, vs2008[(language.id, parameter.id)],
                      '-->', row['value_numeric'])
                value_numeric = vs2008[(language.id, parameter.id)]
            else:
                same += 1
        else:
            updated = E2011
            created = E2011
            if parameter.id[-1] == 'A' and not (
                    parameter.id.startswith('143') or parameter.id.startswith('144')):
                added += 1

        kw = dict(id=id_, updated=updated, created=created)
        valueset = data.add(
            common.ValueSet, row['id'],
            language=language,
            parameter=parameter,
            contribution=parameter.chapter,
            **kw)
        data.add(
            common.Value, id_,
            domainelement=data['DomainElement'][(row['feature_id'], value_numeric)],
            valueset=valueset,
            **kw)

    print(same, 'datapoints did not change')
    print(added, 'datapoints added to existing features')

    DBSession.flush()

    migrate(
        'datapoint_reference', common.ValueSetReference,
        lambda r: dict(valueset=data['ValueSet'][r['datapoint_id']],
                       source=data['Source'][r['reference_id']],
                       description=r['note']))

    migrate(
        'author_chapter', common.ContributionContributor,
        lambda r: dict(ord=r['order'],
                       primary=r['primary'] != 0,
                       contributor_pk=data['Contributor'][r['author_id']].pk,
                       contribution_pk=data['Chapter'][r['chapter_id']].pk))

    migrate(
        'author_supplement', common.ContributionContributor,
        lambda r: dict(ord=r['order'],
                       primary=r['primary'] != 0,
                       contributor_pk=data['Contributor'][r['author_id']].pk,
                       contribution_pk=data['Chapter']['s%s' % r['supplement_id']].pk))

    igts = defaultdict(lambda: [])
    for row in old_db.execute("select * from igt"):
        d = {'id': 'igt-%s' % row['id']}
        d.update(parse_igt(row['xhtml']))
        igts[row['example_id']].append(d)

    for row in old_db.execute("select * from example"):
        if not row['language_id']:
            print('example without language:', row['id'])
            continue
        _igts = igts[row['id']]
        if _igts:
            for igt in _igts:
                data.add(
                    common.Sentence, igt['id'],
                    markup_comment=row['xhtml'],
                    language=data['WalsLanguage'][row['language_id']],
                    **igt)
        else:
            name = teaser(row['xhtml'])
            if name:
                data.add(
                    common.Sentence, row['id'],
                    id=str(row['id']),
                    name=name,
                    xhtml=row['xhtml'],
                    language=data['WalsLanguage'][row['language_id']])

    missing = {}
    for row in old_db.execute("select * from example_feature"):
        _igts = igts[row['example_id']]
        if _igts:
            for igt in _igts:
                try:
                    sentence = data['Sentence'][igt['id']]
                except KeyError:
                    print('missing sentence:', row['example_id'])
                    continue
                try:
                    value = data['Value']['%s-%s' % (row['feature_id'], sentence.language.id)]
                    DBSession.add(common.ValueSentence(sentence=sentence, value=value))
                except KeyError:
                    missing[(row['feature_id'], sentence.language.id)] = 1
        else:
            try:
                sentence = data['Sentence'][row['example_id']]
            except KeyError:
                print('missing sentence:', row['example_id'])
                continue
            try:
                value = data['Value']['%s-%s' % (row['feature_id'], sentence.language.id)]
                DBSession.add(common.ValueSentence(sentence=sentence, value=value))
            except KeyError:
                missing[(row['feature_id'], sentence.language.id)] = 1

    print(len(missing), 'missing datapoints for example_feature relations')
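# The migrate(...) calls above depend on a helper defined elsewhere in the migration
# script. Judging from its usage, the converter either returns None (skip the row),
# a (key, kwargs) pair (register the new object in `data`), or a plain kwargs dict
# (add the object directly to the session). A minimal sketch under those assumptions;
# this is a reconstruction for illustration, not the actual WALS code, and it assumes
# `old_db`, `data` and `DBSession` are available as in the surrounding script:

def migrate(table, model, converter):
    for row in old_db.execute("select * from %s" % table):
        res = converter(row)
        if res is None:
            continue
        if isinstance(res, dict):
            DBSession.add(model(**res))
        else:
            key, kw = res
            data.add(model, key, **kw)
    DBSession.flush()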
def main(args):
    meta = parse_meta(args)
    print(len(meta))
    print(sum(len(m.sources) for m in meta.values()))
    sources = {}
    for m in meta.values():
        for s in m.sources:
            sources[s] = None
    print(len(sources), 'distinct')
    for i, s in enumerate(sources):
        sources[s] = get_source(s, i + 1)

    glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    data = Data()

    wals = create_engine('postgresql://robert@/wals3')
    wals_families = {}
    for row in wals.execute('select name, id from family'):
        wals_families[row[0]] = row[1]
        wals_families[row[1]] = row[1]

    #for item in reader(args.data_file('WALSFamilyAbbreviations.tab'), namedtuples=True, encoding='latin1'):
    #    name = item.FAMILY
    #    if name not in wals_families:
    #        name = slug(name)
    #        if name not in wals_families:
    #            print('missing wals family:', item.FAMILY)
    #            name = None
    #    if name:
    #        wals_families[item.ABBREVIATION] = wals_families[name]

    wals_genera = {row[0]: row[0] for row in wals.execute('select id from genus')}

    with args.data_file('listss17.txt').open(encoding='latin1') as fp:
        wordlists = ['\n'.join(lines) for lines in parse(fp)]

    dataset = common.Dataset(
        id=asjp.__name__,
        name="The ASJP Database",
        contact="*****@*****.**",
        description="The Automated Similarity Judgment Program",
        domain='asjp.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    transcribers = get_transcriber_map(args)
    for i, spec in enumerate([
        ('SW', "Søren Wichmann"),
        ('AM', "André Müller"),
        ('AKW', "Annkathrin Wett"),
        ('VV', "Viveka Velupillai"),
        ('JB', "Julia Bischoffberger"),
        ('CB', "Cecil H. Brown"),
        ('EH', "Eric W. Holman"),
        ('SS', "Sebastian Sauppe"),
        ('ZM', "Zarina Molochieva"),
        ('PB', "Pamela Brown"),
        ('HH', "Harald Hammarström"),
        ('OB', "Oleg Belyaev"),
        ('JML', "Johann-Mattis List"),
        ('DBA', "Dik Bakker"),
        ('DE', "Dmitry Egorov"),
        ('MU', "Matthias Urban"),
        ('RM', "Robert Mailhammer"),
        ('AC', "Agustina Carrizo"),
        ('MSD', "Matthew S. Dryer"),
        ('EK', "Evgenia Korovina"),
        ('DB', "David Beck"),
        ('HG', "Helen Geyer"),
        ('PE', "Patience Epps"),
        ('AG', "Anthony Grant"),
        ('PS', "Paul Sidwell"),  # not in citation
        ('KTR', "K. Taraka Rama"),  # not in citation
        ('PV', "Pilar Valenzuela"),
        ('MD', "Mark Donohue"),  # not in citation
    ]):
        id_, name = spec
        if id_ in transcribers:
            assert name == transcribers.pop(id_)
        contributor = data.add(common.Contributor, id_, id=id_, name=name)
        if id_ in ['SW', 'CB', 'EH']:
            DBSession.add(
                common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for id_, name in transcribers.items():
        data.add(common.Contributor, id_, id=id_, name=name)

    for id_ in sorted(models.MEANINGS_ALL.keys()):
        data.add(
            models.Meaning, id_,
            id=str(id_), name=models.MEANINGS_ALL[id_], core=id_ in models.MEANINGS)

    for n, l in enumerate(wordlists):
        #if n > 100:
        #    break
        lang = models.Doculect.from_txt(l)
        if lang.classification_wals:
            family, genus = lang.classification_wals.split('.')
            lang.wals_family = wals_families.get(family)
            lang.wals_genus = wals_genera.get(slug(genus))
        lang.code_glottolog = glottocodes.get(lang.code_iso)
        add_codes(lang)
        data.add(models.Doculect, lang.id, _obj=lang)
        DBSession.flush()

        md = meta.pop(lang.id, None)
        assert md

        # associate transcribers and sources
        for i, transcriber in enumerate(md.transcribers):
            common.ContributionContributor(
                contribution=lang.wordlist,
                contributor=data['Contributor'][transcriber],
                ord=i + 1)
        for source in md.sources:
            DBSession.add(
                common.LanguageSource(language_pk=lang.pk, source_pk=sources[source].pk))

    assert not list(meta.keys())