def update_providers(args):
    if not args.data_file(args.version, 'provider.txt').exists():
        return

    with open(args.data_file(args.version, 'provider.txt')) as fp:
        content = fp.read().decode('latin1')

    if '\r\n' in content:
        content = content.replace('\r\n', '\n')

    provider_map = get_map(Provider)
    for block in content.split('\n\n\n\n'):
        lines = block.split('\n')
        id_, abbr = lines[0].strip().split(':')
        id_ = id_.split('.')[0]
        description = unescape('\n'.join(lines[1:]))
        name = description.split('.')[0]
        if id_ == 'hedvig-tirailleur':
            id_ = u'skirgard'
        if slug(id_) not in provider_map:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(
                Provider(id=slug(id_), name=name, description=description, abbr=abbr))
def main(args):
    data = Data()
    dataset = common.Dataset(
        id=u'An Crúbadán',
        name=u'An Crúbadán',
        publisher_name="Saint Louis University",
        publisher_place="Saint Louis, USA",
        publisher_url="http://www.slu.edu/",
        description="Linguistic datasets for over 2000 languages created from web-crawled text corpora",
        contact="*****@*****.**",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'https://licensebuttons.net/l/by/4.0/88x31.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )
    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(
        common.Contributor, "Kevin Scannell",
        id="Kevin Scannell", name="Kevin Scannell", email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)
def update_lang(lang, **kw):
    """
    store original name in hname

    .. notes::

        We don't update the alternative names (for name search) here; instead, the
        script to update these names in bulk must be run after this function.
    """
    name = kw.pop('name', None)
    if name and name != lang.name:
        if 'hname' not in lang.jsondatadict:
            lang.update_jsondata(hname=lang.name)
        print 'renamed', lang.name, 'to', name
        lang.name = name
        print lang.jsondata
    for k, v in kw.items():
        if k not in lang.datadict():
            DBSession.add(Language_data(key=k, value=v, object_pk=lang.pk))
        else:
            for d in lang.data:
                if d.key == k and d.value != v:
                    print 'updated', k
                    d.value = v
                    break
def update(args):
    count = 0
    assert args.json

    iid = int(DBSession.execute(
        "select max(cast(id as integer)) from identifier").fetchone()[0]) + 1
    pk = DBSession.execute(
        "select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']
        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(
                        pk=pk, id=str(iid), name=code, type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l, identifier=identifier))

    print count, 'new multitree identifiers'
def main(args):
    user = getpass.getuser()
    data = Data()
    datadir = 'C:\\Python27\\glottobank\\Grambank\\' if user != 'robert' \
        else '/home/robert/venvs/glottobank/Grambank'

    dataset = common.Dataset(
        id=grambank.__name__,
        name="GramBank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    import_features_collaborative_sheet(datadir, data)
    import_cldf(os.path.join(datadir, 'datasets'), data)
    #print data.keys()
    #print data['Parameter'].keys()
    #parameter = data['Parameter'].get(row['Feature_ID'])
    load_families(data, data['GrambankLanguage'].values(), isolates_icon='tcccccc')
def update_lang(lang, **kw):
    """
    store original name in hname

    .. notes::

        We don't update the alternative names (for name search) here; instead, the
        script to update these names in bulk must be run after this function.
    """
    name = kw.pop('name', None)
    if name and name != lang.name:
        if 'hname' not in lang.jsondata:
            lang.update_jsondata(hname=lang.name)
        print 'renamed', lang.name, 'to', name
        lang.name = name
        print lang.jsondata
    for k, v in kw.items():
        if k not in lang.datadict():
            DBSession.add(Language_data(key=k, value=v, object_pk=lang.pk))
        else:
            for d in lang.data:
                if d.key == k and d.value != v:
                    print 'updated', k
                    d.value = v
                    break
def import_cognatesets(dataset, forms, bibliography, contribution, cognatesets={}):
    cognateset_by_formid = {}
    cognateset_forms = {}
    for row in dataset["CognateTable"].iterdicts():
        # Only incorporate the newest cognate codings, and be robust about that
        try:
            cs = cognateset_forms.setdefault(row["Cognateset_ID"], [])
            cs.append(forms[row["Form_ID"]].name)
            row["CognateForms"] = cs
            cognateset_by_formid[row["Form_ID"]] = row
        except KeyError:
            continue

    for row in cognateset_by_formid.values():
        cognateset_id = row["Cognateset_ID"]
        try:
            cognateset = cognatesets[cognateset_id]
        except KeyError:
            row["CognateForms"].sort()
            cognateset = cognatesets[cognateset_id] = Cognateset(
                id=row["Cognateset_ID"],
                contribution=contribution,
                name=row["CognateForms"][len(row["CognateForms"]) // 2])
        assoc = CognatesetCounterpart(
            cognateset=cognateset,
            doubt=True if "LexStat" in row["Source"] else False,
            alignment=(None if not row["Alignment"] else " ".join(row["Alignment"])),
            counterpart=forms[row["Form_ID"]])
        for source in row["Source"]:
            DBSession.add(CognatesetCounterpartReference(
                cognatesetcounterpart_pk=assoc.pk,
                source=bibliography[source]))
def main(args):
    datadir = '/home/robert/venvs/glottobank/lexibank'

    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="LexiBank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

        for provider in [
            'transnewguinea',
            'abvd',
            'ids',
        ]:
            import_cldf(os.path.join(datadir, provider, 'cldf'), provider)

    with transaction.manager:
        load_families(Data(), DBSession.query(LexibankLanguage), isolates_icon='tcccccc')
def main(args):
    data = Data()
    dataset = common.Dataset(
        id=culturebank.__name__,
        name="CultureBank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='culturebank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'No license yet'})  # Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    import_features_collaborative_sheet(CULTUREBANK_REPOS, data)
    import_cldf(os.path.join(CULTUREBANK_REPOS, 'datasets'), data)
    ##import_cldf("C:\\python27\\dbs\\bwohh\\", data, add_missing_features = True)

    load_families(
        data,
        list(data['CulturebankLanguage'].values()),
        isolates_icon='tcccccc')
    return
def update_providers(args, verbose=False):
    filepath = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    p = RawConfigParser()
    with io.open(filepath, encoding='utf-8-sig') as fp:
        p.readfp(fp)

    provider_map = get_map(Provider)
    for section in p.sections():
        sectname = section[:-4] if section.endswith('.bib') else section
        id_ = slug(sectname)
        attrs = {
            'name': p.get(section, 'title'),
            'description': p.get(section, 'description'),
            'abbr': p.get(section, 'abbr'),
        }
        if id_ in provider_map:
            provider = provider_map[id_]
            for a in list(attrs):
                before, after = getattr(provider, a), attrs[a]
                if before == after:
                    del attrs[a]
                else:
                    setattr(provider, a, after)
                    attrs[a] = (before, after)
            if attrs:
                args.log.info('updating provider %s %s' % (slug(id_), sorted(attrs)))
                if verbose:
                    for a, (before, after) in attrs.items():
                        before, after = (' '.join(_.split()) for _ in (before, after))
                        if before != after:
                            args.log.info('%s\n%r\n%r' % (a, before, after))
        else:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
def testapp():
    from webtest import TestApp
    from clld.db.meta import DBSession, VersionedDBSession, Base
    from clld.db.models import common
    from clld_cognacy_plugin.models import Cognateset, Cognate

    def main():
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': [
                'clld:web/templates', 'clld_cognacy_plugin:templates'
            ]})
        cfg.include('clld.web.app')
        cfg.include('clld_cognacy_plugin')
        return cfg.make_wsgi_app()

    DBSession.remove()
    VersionedDBSession.remove()
    wsgi_app = main()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()
    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    cs = Cognateset(id='1', name='cs: test')
    lang = common.Language(id='l', latitude=2, longitude=2)
    param = common.Parameter(id='l')
    vs = common.ValueSet(id='vs', language=lang, parameter=param)
    v = common.Value(id='v', name='abc', valueset=vs)
    DBSession.add(Cognate(cognateset=cs, counterpart=v))
    yield TestApp(wsgi_app)
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cdk.__name__,
        name="CDK",
        description="Comprehensive Dictionary of Ket",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cdk.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    contrib = common.Contribution(id='ket', name=dataset.name)
    DBSession.add(contrib)
    for i, (id, name) in enumerate([
        ('kotorov', 'E.G. Kotorova'),
        ('nefedov', 'A.V. Nefedov'),
    ]):
        dataset.editors.append(
            common.Editor(contributor=common.Contributor(id=id, name=name), ord=i))

    ket = data.add(
        common.Language, 'ket',
        id='ket', name='Ket', latitude=63.76, longitude=87.55)
    add_language_codes(data, ket, 'ket', glottocode='kett1243')
    for abbr, name in DIALECTS.items():
        data.add(common.Language, abbr, id=abbr, name=name)

    with args.data_file('sources.txt').open(encoding='utf8') as fp:
        for i, chunk in enumerate(fp.read().split('\n\n\n')):
            try:
                id_, year, author, desc = chunk.split('\n')
            except:
                print(chunk)
                raise
            data.add(
                common.Source, id_,
                id=str(i + 1), name=id_, author=author, year=year, description=desc)

    with UnicodeReader(args.data_file('Ket_nouns_and_other_pos_table.docx.csv')) as reader:
        load(data, reader, ket, contrib, verbs=False)

    with UnicodeReader(args.data_file('Ket_verbs_table.docx.csv')) as reader:
        load(data, reader, ket, contrib)

    print('parsing examples problematic in %s cases' % len(PROBLEMS))
def import_sources(wordlist, contribution, contributors={}):
    """Load the bibliography"""
    contributions = {}
    by_name = {}
    for source in wordlist.sources.items():
        fields = source.entry.fields

        # Generate a citation from the source
        citation_contrib = None
        for role, people in source.entry.persons.items():
            if not people:
                continue
            names = " and ".join(map(str, people))
            fields[role] = names

            if not citation_contrib:
                if len(people) == 1:
                    citation_contrib = " ".join(people[0].last_names)
                elif len(people) == 2:
                    citation_contrib = "{:} & {:}".format(
                        " ".join(people[0].last_names), " ".join(people[1].last_names))
                else:
                    citation_contrib = "{:} et al.".format(" ".join(people[0].last_names))

        if citation_contrib:
            if fields.get("year"):
                name = "{:}, {:}".format(citation_contrib, fields["year"])
            else:
                name = "{:}".format(citation_contrib)
        else:
            title_like = fields.get("title") or fields.get("note")
            if fields.get("year"):
                name = "{:}, {:}".format(title_like, fields["year"])
            else:
                name = "{:}".format(title_like)

        if name in by_name:
            name = "{:}a".format(name)
            while name in by_name:
                name = name[:-1] + chr(ord(name[-1]) + 1)

        # create a contribution
        contrib = LexiRumahSource(
            id=source.id,
            name=name,
            bibtex_type=vars(EntryType).get(source.genre) or EntryType.misc,
            provider=contribution)
        for key, value in fields.items():
            if hasattr(contrib, key) and not getattr(contrib, key):
                setattr(contrib, key, value)
            else:
                contrib.jsondata[key] = value
        DBSession.add(contrib)

        contributions[source.id] = contrib
        by_name[name] = contrib

    return contributions
def test_Dataset(self):
    from clld import RESOURCES
    from clld.db.models.common import Dataset, Source

    d = Dataset(id='abc', domain='test')
    DBSession.add(d)
    DBSession.flush()
    d.get_stats(RESOURCES, source=Source.id == None)
def test_CustomModelMixin(self):
    from clld.tests.fixtures import CustomLanguage

    DBSession.add(CustomLanguage(id='abc', name='Name', custom='c'))
    DBSession.flush()
    for lang in DBSession.query(Language).filter(Language.id == 'abc'):
        self.assertEqual(lang.custom, 'c')
        break
def main(args):  # pragma: no cover
    global MAX_IDENTIFIER_PK
    with transaction.manager:
        MAX_IDENTIFIER_PK = DBSession.query(
            Identifier.pk).order_by(desc(Identifier.pk)).first()[0]

        gl_name = glottolog_name()
        gl_names = glottolog_names()

        languoids = {l.pk: l for l in DBSession.query(Languoid)}
        for attrs in jsonload(args.data_dir.joinpath('languoids', 'changes.json')):
            replacement = attrs.pop('replacement', None)
            hname = attrs.pop('hname', None)

            for name, enum in [('level', LanguoidLevel), ('status', LanguoidStatus)]:
                if name in attrs:
                    attrs[name] = enum.from_string(attrs[name])

            l = languoids.get(attrs['pk'])
            if l:
                for k, v in attrs.items():
                    setattr(l, k, v)
                #
                # We do not assign ISO codes for existing languages, because it could be
                # that the ISO code is now assigned to a family node, due to a change
                # request, e.g. see https://github.com/clld/glottolog-data/issues/40
                #
                if len(l.hid or '') == 3 and not l.iso_code:
                    args.log.warn('Language with hid %s but no iso code!' % l.hid)
            else:
                l = Languoid(**attrs)
                DBSession.add(l)
                languoids[l.pk] = l

                if len(attrs.get('hid', '')) == 3:
                    create_identifier(
                        None, l, name=attrs['hid'], type=IdentifierType.iso.value)

                create_identifier(
                    gl_names.get(l.name),
                    l,
                    name=l.name,
                    description=gl_name.description,
                    type=gl_name.type)

            if hname:
                l.update_jsondata(hname=hname)

            if replacement:
                DBSession.add(Superseded(
                    languoid_pk=l.pk,
                    replacement_pk=replacement,
                    relation='classification update'))

            DBSession.flush()

        recreate_treeclosure()
def test_Base_jsondata(db):
    l = Language(id='abc', name='Name')
    DBSession.add(l)
    DBSession.flush()
    l.update_jsondata(a=1)
    assert 'a' in l.jsondata
    l.update_jsondata(b=1)
    assert 'b' in l.jsondata and 'a' in l.jsondata
    assert 'b' in l.__json__(None)['jsondata']
def test_JSONEncodedDict(db):
    l = Language(id='abc', name='Name', jsondata={'i': 2})
    DBSession.add(l)
    DBSession.flush()

    DBSession.expunge(l)
    for lang in DBSession.query(Language).filter(Language.id == 'abc'):
        assert lang.jsondata['i'] == 2
        break
def create_identifier(identifier, l, **kw):
    global MAX_IDENTIFIER_PK
    if identifier is None:
        MAX_IDENTIFIER_PK += 1
        DBSession.add(Identifier(pk=MAX_IDENTIFIER_PK, id=str(MAX_IDENTIFIER_PK), **kw))
        pk = MAX_IDENTIFIER_PK
    else:
        pk = identifier.pk
    DBSession.add(LanguageIdentifier(language_pk=l.pk, identifier_pk=pk))
def test_Data(db):
    from clld.db.models.common import Language, Language_data

    l = Language(id='abc', name='Name')
    l.data.append(Language_data(key='abstract', value='c'))
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.datadict()['abstract'] == 'c'
def test_Data(self):
    from clld.db.models.common import Language, Language_data

    l = Language(id='abc', name='Name')
    l.data.append(Language_data(key='abstract', value='c'))
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    self.assertEqual(l.datadict()['abstract'], 'c')
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet,
            vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)

        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
def migrate(from_, to_, converter):  # pragma: no cover
    for row in DB.execute("select * from %s" % from_):
        res = converter(row)
        if not res:
            continue
        if isinstance(res, dict):
            DBSession.add(to_(**res))
        else:
            data.add(to_, res[0], **res[1])
    DBSession.flush()
def test_Base(db):
    l = Language(id='abc', name='Name')
    DBSession.add(l)
    DBSession.flush()
    DBSession.expunge(l)
    l = Language.get('abc', session=DBSession)
    assert l.name == 'Name'
    Language().__str__()
    assert repr(l) == "<Language 'abc'>"
def test_compute_language_sources(self):
    from clld.db.models.common import Source, Sentence, Language, SentenceReference
    from clld.db.meta import DBSession
    from clld.db.util import compute_language_sources

    s = Sentence(id='sentenced', language=Language(id='newlang'))
    sr = SentenceReference(sentence=s, source=Source.first())
    DBSession.add(sr)
    DBSession.flush()
    compute_language_sources()
def add_morpheme_reference(morpheme, source_string):
    bib_key, pages = get_key_and_page(source_string)
    if bib_key in data["Source"]:
        source = data["Source"][bib_key]
        DBSession.add(models.MorphemeReference(
            morpheme=morpheme,
            source=source,
            key=source.id,
            description=pages.replace("--", "–")
        ))
def import_features(cldf, contributors):  # pragma: no cover
    """
    ? = gray cbbbbbb (is ? mapped? if not then don't worry)
    0 = blue c0077bb
    1 = red ccc3311
    2 = teal c009988
    3 = orange cee7733
    """
    features, codes = {}, {}
    icons = [
        'cffffff',  # 'c0077bb'
        'cff0000',  # 'ccc3311'
        'c0000ff',  # 'c009988'
        'cffff00',  # 'cee7733'
    ]
    domains = {}
    for fid, des in itertools.groupby(
            sorted(cldf['CodeTable'], key=lambda c: c['Parameter_ID']),
            lambda c: c['Parameter_ID']):
        domains[fid] = list(des) + [
            dict(ID=fid + '-NA', Name='?', Description='Not known')
        ]
    for feature in tqdm(list(cldf['ParameterTable']), desc='loading features'):
        fid = feature['ID']
        f = Feature(
            id=fid,
            name=feature['Name'],
            description=feature['Description'],
        )
        for ord, patron in enumerate(feature['Patrons'], start=1):
            DBSession.add(
                FeaturePatron(ord=1, feature=f, contributor_pk=contributors[patron]))
        for code in domains[fid]:
            if code['Name'] == '?':
                icon, number, value = 'tcccccc', 999, None
            else:
                icon, number, value = icons[int(code['Name'])], int(code['Name']), code['Name']
            DomainElement(
                id=code['ID'],
                parameter=f,
                name=code['Name'],
                number=number,
                description=code['Description'],
                jsondata=dict(icon=icon))
        DBSession.add(f)
        DBSession.flush()
        features[fid] = f.pk
        for de in f.domain:
            codes[de.id] = de.pk
    return features, codes
def _addSource(lp):
    """For a lighter 'main' function."""
    DBSession.add(
        common.Source(
            id=lp[0], name=lp[0], author=lp[2], year=lp[3],
            title=lp[4], url=lp[5], note=lp[6]))
    DBSession.flush()
def update(args):
    pid, cid = 'vitality', 'unesco'
    count = 0
    notfound = {}

    contrib = common.Contribution.get(cid, default=None)
    if not contrib:
        contrib = common.Contribution(
            id=cid,
            name='Atlas of the World’s Languages in Danger',
            description='Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas')

    param = common.Parameter.get(pid, default=None)
    if param is None:
        param = common.Parameter(id=pid, name='Degree of endangerment')

    domain = {de.name: de for de in param.domain}
    for i, spec in enumerate(VITALITY_VALUES):
        name, desc = spec
        if name not in domain:
            number = i + 1
            domain[name] = common.DomainElement(
                id='%s-%s' % (pid, number),
                name=name,
                description=desc,
                number=number,
                parameter=param)

    valuesets = {vs.id: vs for vs in param.valuesets}

    for item in reader(args.data_file(DATA_FILE), dicts=True):
        if item['ISO639-3 codes']:
            for code in item['ISO639-3 codes'].split(','):
                code = code.strip()
                lang = Languoid.get(code, key='hid', default=None)
                if lang:
                    count += 1
                    item['url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code
                    lang.update_jsondata(unesco=item)
                    de = domain[item['Degree of endangerment']]
                    vsid = '%s-%s' % (pid, lang.id)
                    vs = valuesets.get(vsid)
                    if not vs:
                        vs = common.ValueSet(
                            id='vitality-%s' % lang.id,
                            parameter=param,
                            contribution=contrib,
                            language=lang)
                        DBSession.add(common.Value(valueset=vs, name=de.name, domainelement=de))
                        valuesets[vsid] = vs
                    else:
                        vs.values[0].domainelement = de
                else:
                    notfound[code] = 1

    print 'assigned', count, 'unesco urls'
    print 'missing iso codes:', notfound
def add(self, model, key, **kw):
    if kw.keys() == ['_obj']:
        # if a single keyword parameter _obj is passed, we take it to be the object
        # which should be added to the session.
        new = kw['_obj']
    else:
        for k, v in self.defaults.items():
            kw.setdefault(k, v)
        new = model(**kw)
    self[model.mapper_name()][key] = new
    DBSession.add(new)
    return new
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP", domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate([
        ("bickel", "Balthasar Bickel", "University of Zurich"),
        ("nichols", "Johanna Nichols", "University of California, Berkeley"),
    ]):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
            args.data_file("backbone_09Jan2014_directexport.tab"),
            newline="\r",
            encoding="macroman",
            namedtuples=True):
        # LID language ISO639.3.2013 stock continent area latitude longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid,
            l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
def test_Files(self):
    from clld.db.models.common import Language, Language_files

    if PY3:
        return  # pragma: no cover

    l = Language(id='abc', name='Name')
    assert l.iso_code is None
    l._files.append(Language_files(id='abstract'))
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    f = l.files['abstract']
def add_refs(self, data, table, row, obj):
    if table == 'EntryTable':
        model, kw = models.WordReference, dict(word=obj)
    elif table == 'SenseTable':
        model, kw = models.MeaningReference, dict(meaning=obj)
    else:
        raise ValueError(table)

    refs_col = self.cldf.get((table, 'source'))
    if refs_col:
        for sid, context in map(self.cldf.sources.parse, row.get(refs_col.name, [])):
            if sid in data['DictionarySource']:
                DBSession.add(model(
                    source=data['DictionarySource'][sid], description=context, **kw))
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'
            })
        DBSession.add(dataset)

        glottolog_repos = Path(
            lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'glottolog3', 'glottolog')
        languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
        concepticon = Concepticon(
            Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'concepticon', 'concepticon-data'))
        conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

        skip = True
        for dname in sorted(repos.joinpath('datasets').iterdir(), key=lambda p: p.name):
            #if dname.name == 'benuecongo':
            #    skip = False
            #if skip:
            #    continue
            if dname.is_dir() and dname.name != '_template':
                mdpath = dname.joinpath('cldf', 'metadata.json')
                if mdpath.exists():
                    print(dname.name)
                    import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')
def add(self, model, key, **kw):
    if '.' in kw.get('id', ''):
        raise ValueError('Object id contains illegal character "."')

    if list(kw.keys()) == ['_obj']:
        # if a single keyword parameter _obj is passed, we take it to be the object
        # which should be added to the session.
        new = kw['_obj']
    else:
        for k, v in self.defaults.items():
            kw.setdefault(k, v)
        new = model(**kw)
    self[model.__name__][key] = new
    DBSession.add(new)
    return new
def testapp():
    def main():
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': ['clldmpg:templates', 'clld:web/templates']})
        cfg.include('clldmpg')
        return cfg.make_wsgi_app()

    DBSession.remove()
    wsgi_app = main()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()
    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    yield ExtendedTestApp(wsgi_app)
def prime_cache(args):
    # add number of data points per parameter
    for np in DBSession.query(models.NumberParameter, func.count(common.Parameter.pk)) \
            .join(common.Parameter) \
            .join(common.ValueSet) \
            .join(common.Value) \
            .group_by(models.NumberParameter.pk, common.Parameter.pk):
        np[0].count_of_datapoints = np[1]

    # add number of distinct varieties per parameter based on assigned glottocodes
    for np in DBSession.query(models.NumberParameter, func.count(common.Identifier.name)) \
            .join(common.ValueSet) \
            .join(common.Value) \
            .join(common.Language, common.ValueSet.language_pk == common.Language.pk) \
            .join(common.LanguageIdentifier) \
            .join(common.Identifier) \
            .filter(common.Identifier.type == common.IdentifierType.glottolog.value) \
            .group_by(models.NumberParameter.pk, common.Parameter.pk):
        np[0].count_of_varieties = np[1]

    # add number of data points of parameter "base"
    base_pk, cnt_base = DBSession.query(common.Parameter.pk, func.count(common.ValueSet.pk)) \
        .join(common.Parameter) \
        .filter(common.Parameter.name == 'Base') \
        .group_by(common.Parameter.pk).all()[0]
    for np in DBSession.query(models.Parameter) \
            .join(models.NumberParameter) \
            .filter(common.Parameter.pk == base_pk):
        np.count_of_datapoints = cnt_base
        break

    DBSession.query(LanguageTreeLabel).delete()
    DBSession.query(TreeLabel).delete()
    DBSession.query(Phylogeny).delete()

    langs = [l for l in DBSession.query(common.Language) if l.glottocode]
    newick, _ = tree(
        [l.glottocode for l in langs], gl_repos=gl_repos
    )
    phylo = Phylogeny(id="phy", name="glottolog global tree", newick=newick)
    for l in langs:
        LanguageTreeLabel(
            language=l, treelabel=TreeLabel(id=l.id, name=l.glottocode, phylogeny=phylo)
        )
    DBSession.add(phylo)
def add_identifier(languoid, data, name, type, description, lang='en'):
    identifier = data['Identifier'].get((name, type, description, lang))
    if not identifier:
        identifier = data.add(
            common.Identifier,
            (name, type, description, lang),
            id='{0}-{1}-{2}-{3}'.format(
                slug(name), slug(type), slug(description or ''), lang),
            name=name,
            type=type,
            description=description,
            lang=lang)
    DBSession.add(
        common.LanguageIdentifier(language=languoid, identifier=identifier))
def add(self, model, key, **kw):
    if '.' in kw.get('id', ''):
        raise ValueError('Object id contains illegal character "."')

    if kw.keys() == ['_obj']:
        # if a single keyword parameter _obj is passed, we take it to be the object
        # which should be added to the session.
        new = kw['_obj']
    else:
        for k, v in self.defaults.items():
            kw.setdefault(k, v)
        new = model(**kw)
    self[model.mapper_name()][key] = new
    DBSession.add(new)
    return new
def testapp():
    def main():
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': ['clldlucl:templates', 'clld:web/templates']})
        cfg.include('clldlucl')
        return cfg.make_wsgi_app()

    DBSession.remove()
    wsgi_app = main()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()
    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    yield ExtendedTestApp(wsgi_app)
def test_Files(db, tmppath):
    from clld.db.models.common import Sentence, Sentence_files

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(tmppath, 'content')
    assert Path(p).exists()

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
def test_Files(db, tmppath):
    from clld.db.models.common import Sentence, Sentence_files

    l = Sentence(id='abc', name='Name')
    f = Sentence_files(object=l, id='abstract', mime_type='audio/mpeg')
    p = f.create(Path(tmppath), 'content')
    assert Path(p).exists()

    l._files.append(f)
    DBSession.add(l)
    DBSession.flush()
    DBSession.refresh(l)
    assert l.files
    assert l.audio
def add(self, model, key, **kw):
    if "." in kw.get("id", ""):
        raise ValueError('Object id contains illegal character "."')

    if list(kw.keys()) == ["_obj"]:
        # if a single keyword parameter _obj is passed, we take it to be the object
        # which should be added to the session.
        new = kw["_obj"]
    else:
        for k, v in self.defaults.items():
            kw.setdefault(k, v)
        new = model(**kw)
    self[model.__name__][key] = new
    DBSession.add(new)
    return new
def add_language_codes(data, lang, isocode, glottocodes=None):
    def identifier(type_, id_):
        return data.add(
            common.Identifier, '%s:%s' % (type_, id_),
            id='%s:%s' % (type_, id_),
            name=id_,
            type=getattr(common.IdentifierType, type_).value)

    if isocode and len(isocode) == 3:
        DBSession.add(common.LanguageIdentifier(
            language=lang, identifier=identifier('iso', isocode)))
    if glottocodes and isocode in glottocodes:
        DBSession.add(common.LanguageIdentifier(
            language=lang, identifier=identifier('glottolog', glottocodes[isocode])))
def test_CsvMixin(db):
    l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
    DBSession.add(l1)
    DBSession.flush()

    l1 = Language.csv_query(DBSession).first()
    cols = l1.csv_head()
    row = l1.to_csv()
    for k, v in zip(cols, row):
        if k == 'jsondata':
            assert 'a' in json.loads(v)
    l2 = Language.from_csv(row)
    assert pytest.approx(l1.latitude) == l2.latitude

    row[cols.index('latitude')] = '3,5'
    l2 = Language.from_csv(row)
    assert l2.latitude < l1.latitude
def test_CsvMixin(self):
    l1 = Language(id='abc', name='Name', latitude=12.4, jsondata=dict(a=None))
    DBSession.add(l1)
    DBSession.flush()

    l1 = Language.csv_query(DBSession).first()
    cols = l1.csv_head()
    row = l1.to_csv()
    for k, v in zip(cols, row):
        if k == 'jsondata':
            self.assertIn('a', json.loads(v))
    l2 = Language.from_csv(row)
    assert_almost_equal(l1.latitude, l2.latitude)

    row[cols.index('latitude')] = '3,5'
    l2 = Language.from_csv(row)
    self.assertLess(l2.latitude, l1.latitude)
def main(args):
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    for i, row in enumerate(reader(file(args.data_file('fb_jcld.tab')), namedtuples=True, encoding='latin1')):
        if row.Feature not in data['Parameter']:
            parameter = data.add(common.Parameter, row.Feature, id='1', name=row.Feature)
        else:
            parameter = data['Parameter'][row.Feature]

        if row.Value not in data['DomainElement']:
            de = data.add(
                common.DomainElement, row.Value,
                id='%s-%s' % (parameter.id, slug(row.Value)), parameter=parameter, name=row.Value)
        else:
            de = data['DomainElement'][row.Value]

        if row.Language not in data['Language']:
            if row.Language not in glottolog:
                print '--->', row.Language
                continue
            glottocode, name, lat, lon = glottolog[row.Language]
            language = data.add(
                common.Language, row.Language,
                id=slug(row.Language), name=name, latitude=lat, longitude=lon)
        else:
            language = data['Language'][row.Language]

        id_ = str(i + 1)  #'%s-%s' % (parameter.id, language.id)
        vs = common.ValueSet(
            id=id_,
            parameter=parameter,
            language=language,
            contribution=contribution,
            description=row.Comment,
            source=row.Source)

        common.Value(valueset=vs, name=row.Value, domainelement=de)
def add_file(self, type_, checksum, file_cls, obj):
    if checksum in self.cdstar:
        jsondata = {k: v for k, v in self.props.get(type_, {}).items()}
        jsondata.update(self.cdstar[checksum])
        f = file_cls(
            id='%s-%s' % (obj.id, checksum),
            name=self.cdstar[checksum]['original'],
            object_pk=obj.pk,
            mime_type=self.cdstar[checksum]['mimetype'],
            jsondata=jsondata)
        DBSession.add(f)
        DBSession.flush()
        DBSession.refresh(f)
        return
    print('{0} file missing: {1}'.format(type_, checksum))
    return
def add_values(data, dblang, pid, values, with_de=True, **vskw):
    vs = None
    for i, (vid, vname) in enumerate(values):
        if i == 0:
            vs = common.ValueSet(
                id=idjoin(pid, dblang.id),
                language=dblang,
                parameter=data['Parameter'][pid],
                contribution=data['Contribution']['glottolog'],
                **vskw)
        vkw = dict(id=idjoin(pid, slug(vid), dblang.id), name=vname, valueset=vs)
        if with_de:
            vkw['domainelement'] = data['DomainElement'][pid, vid]
        DBSession.add(common.Value(**vkw))
def _addEditor(dataset, count, lp):
    """For a lighter 'main' function."""
    eds = ['Frank Seifart', 'Ludger Paschen', 'Matthew Stave']
    ed = dorEditor(
        id=lp[0], name=lp[0], url=lp[1], email=lp[2],
        address=lp[3], team=lp[4], function=lp[5])
    if lp[0] in eds:
        common.Editor(dataset=dataset, contributor=ed, ord=count + 1)
        count += 1
    DBSession.add(ed)
    DBSession.flush()
    return dataset, count
def add_identifier(languoid, data, name, type, description, lang='en'):
    if len(lang) > 3:
        # Weird stuff introduced via hhbib_lgcode names. Roll back language parsing.
        name, lang = '{0} [{1}]'.format(name, lang), 'en'

    identifier = data['Identifier'].get((name, type, description, lang))
    if not identifier:
        identifier = data.add(
            common.Identifier,
            (name, type, description, lang),
            id='{0}-{1}-{2}-{3}'.format(
                slug(name), slug(type), slug(description or ''), lang),
            name=name,
            type=type,
            description=description,
            lang=lang)
    DBSession.add(common.LanguageIdentifier(language=languoid, identifier=identifier))
def _addText(lp):
    """For a lighter 'main' function and because of checks."""
    for a in range(1, len(lp)):
        if a == 18:
            if lp[a] == "no":
                lp[a] = False
            else:
                lp[a] = True
        elif a == 17:
            if not lp[a] or str(lp[a]).startswith("check"):
                lp[a] = 0
        elif a == 9:
            genre = lp[9].lower()
            if genre == "personal narrative":
                genre = "pers. narr."
            elif genre == "traditional narrative":
                genre = "trad. narr."
            elif genre == "conversation":
                genre = "convers."
            elif genre == "stimulus-based":
                genre = "stimulus"
            lp[9] = genre
        elif not lp[a]:
            lp[a] = 'na'
    DBSession.add(
        doreContrib(
            id=lp[1], tname=lp[2], spks=lp[3], spks_age=lp[4], spks_agec=lp[5],
            spks_sex=lp[6], recdate=lp[7], recdatec=lp[8], genre=lp[9],
            subgenre=lp[10], gloss=lp[11], transl=lp[12], sound=lp[13],
            overlap=lp[14], process=lp[15], NAK=lp[16], glottocode=lp[0],
            words=lp[17], extended=lp[18]))
    DBSession.flush()