def __enter__(self):
    self.engine = engine_from_config(self.settings)
    DBSession.remove()
    DBSession.configure(bind=self.engine)
    assert DBSession.bind == self.engine
    Base.metadata.create_all(self.engine)
    return DBSession
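# Hedged sketch (not from the original source): a matching __exit__ for the
# context manager above, assuming the owning class keeps the engine it created
# in __enter__. It removes the scoped session and disposes of the engine so
# repeated `with` blocks start from a clean slate.
def __exit__(self, exc_type, exc_val, exc_tb):
    DBSession.remove()
    self.engine.dispose()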
def create_languoid(row, father_pk=None):
    glottocode = {'akun1242': 'akun1241'}.get(row['alnumcode'], row['alnumcode'])
    attrs = dict(
        pk=row['id'],
        id=glottocode,
        name=row['primaryname'],
        description=row['globalclassificationcomment'],
        level=getattr(models2.LanguoidLevel, row['level']),
        status=getattr(models2.LanguoidStatus, (row['status'] or '').replace(' ', '_'), None),
        father_pk=father_pk,
        created=row['updated'],
        jsondata={} if not row['hname'] else {'hname': row['hname']},
    )
    for attr in ['active', 'updated', 'hid', 'latitude', 'longitude']:
        attrs[attr] = row[attr]

    l = data.add(models2.Languoid, row['id'], **attrs)

    for type_ in params:
        id_ = '%s%s' % (type_, row['id'])
        vs = data.add(
            common.ValueSet, id_,
            id=id_,
            description=row['classificationcomment']
                if type_ == 'fc' else row['subclassificationcomment'],
            language=l,
            parameter=params[type_],
            contribution=contrib)
        data.add(
            common.Value, id_,
            id=id_,
            name='%s - %s' % (row['level'], row['status']),
            valueset=vs)
        DBSession.flush()
        valuesets[id_] = vs.pk

    return str(row['id'])
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for concept in DBSession.query(models.ConceptSet):
        concept.representation = len(concept.valuesets)

    ul = []
    for clist in DBSession.query(models.Conceptlist):
        clist.items = len(clist.valuesets)
        ul.append((clist.name, uniqueness(clist)))

    #for i, (n, u) in enumerate(sorted(ul, key=lambda t: t[1], reverse=True)):
    #    if i > 10:
    #        break
    #    print n, u

    similarities = {}
    for cl1, cl2 in combinations(DBSession.query(models.Conceptlist), 2):
        s = similarity(cl1, cl2)
        similarities[(cl1.name, cl2.name)] = s

    for i, ((l1, l2), s) in enumerate(
            sorted(similarities.items(), key=lambda i: i[1], reverse=True)):
        if i < 20:
            print l1, l2, s
        if s == 0:
            pass
def dataset_detail_html(context=None, request=None, **kw):
    res = dict(
        (row[0], row[1])
        for row in DBSession.execute("select source, count(pk) from inventory group by source"))
    res["inventory_count"] = DBSession.query(Inventory).count()
    res["segment_count"] = DBSession.query(Parameter).count()
    res["language_count"] = DBSession.query(Language).count()
    res["contributors"] = (
        DBSession.query(Contributor)
        .order_by(Contributor.name)
        .options(
            joinedload(Contributor.contribution_assocs),
            joinedload(Contributor.references))
        .all()
    )
    res["sources"] = {
        k: Source.get(k)
        for k in [
            "moisikesling2011",
            "ipa2005",
            "hayes2009",
            "moran2012a",
            "moranetal2012",
            "cysouwetal2012",
            "mccloyetal2013",
        ]
    }
    res["descriptions"] = {
        c.id: desc(request, c.description, res["sources"]) for c in res["contributors"]}
    return res
def __init__(self, pk=None):
    if pk:
        self.obj = DBSession.query(amsd.models.MessageStick)\
            .filter(amsd.models.MessageStick.pk == pk)
    else:
        # use isnot(None) so the condition is rendered as SQL;
        # "is not None" would evaluate to a Python bool and break the filter
        self.obj = DBSession.query(amsd.models.MessageStick)\
            .filter(amsd.models.MessageStick.latitude.isnot(None))
def getRefs(params):
    query = DBSession.query(Ref)
    filtered = False

    for param, value in params['biblio'].items():
        if value:
            filtered = True
            query = query.filter(icontains(getattr(Ref, param), value))

    if params.get('languoids'):
        filtered = True
        lids = DBSession.query(TreeClosureTable.child_pk)\
            .filter(TreeClosureTable.parent_pk.in_([l.pk for l in params['languoids']]))\
            .subquery()
        query = query.join(LanguageSource, LanguageSource.source_pk == Ref.pk)\
            .filter(LanguageSource.language_pk.in_(lids))

    if params.get('doctypes'):
        filtered = True
        query = query.join(Refdoctype)\
            .filter(Refdoctype.doctype_pk.in_([l.pk for l in params['doctypes']]))

    if params.get('macroareas'):
        filtered = True
        query = query.join(Refmacroarea)\
            .filter(Refmacroarea.macroarea_pk.in_([l.pk for l in params['macroareas']]))

    if not filtered:
        return []
    return query.distinct()
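# Hedged usage sketch (assumption, not taken from the source): getRefs expects
# a params dict shaped like the one produced by get_params in
# langdoccomplexquery further below, i.e. a 'biblio' mapping of Ref column
# names to search strings plus optional lists of model instances per facet.
def example_getRefs_call(languoid, doctype):
    params = {
        'biblio': {'author': 'Haspelmath', 'year': '2008'},
        'languoids': [languoid],   # Languoid instances
        'doctypes': [doctype],     # Doctype instances
        'macroareas': [],          # Macroarea instances
    }
    return getRefs(params)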
def update(args):
    count = 0
    assert args.json

    iid = int(DBSession.execute(
        "select max(cast(id as integer)) from identifier").fetchone()[0]) + 1
    pk = DBSession.execute(
        "select max(pk) from identifier").fetchone()[0] + 1

    langs = {}
    for gid, name in args.json['wikipedia'].items():
        if gid not in langs:
            langs[gid] = Languoid.get(gid)
        langs[gid].update_jsondata(wikipedia=name.split('/')[-1])

    for gid, codes in args.json['multitree'].items():
        l = langs[gid]
        lcodes = [i.name for i in l.identifiers if i.type == 'multitree']

        for code in set(codes):
            if code not in lcodes:
                identifier = DBSession.query(common.Identifier)\
                    .filter(common.Identifier.type == 'multitree')\
                    .filter(common.Identifier.name == code)\
                    .first()
                if not identifier:
                    identifier = common.Identifier(
                        pk=pk, id=str(iid), name=code, type='multitree')
                    iid += 1
                    pk += 1
                count += 1
                DBSession.add(
                    common.LanguageIdentifier(language=l, identifier=identifier))

    print count, 'new multitree identifiers'
def dataset_detail_html(context=None, request=None, **kw):
    example_image_name = '20161106-ebay-s-l1600_0.jpg'
    example = None
    try:
        example_context = DBSession.query(Contribution_files) \
            .filter(Contribution_files.name == example_image_name).all()[0]
        example = {
            'link_url': '%s/%s' % (request.route_url('images'), example_context.id),
            'image_url': cdstar.bitstream_url(example_context),
        }
    except IndexError:  # pragma: no cover
        pass

    return {
        'count_sticks': len(DBSession.query(amsd.models.MessageStick).all()),
        'count_ling_areas': len(DBSession.query(amsd.models.ling_area).all()),
        'example': example,
        'count_terms': len(
            DBSession.query(amsd.models.MessageStick.stick_term).filter(
                amsd.models.MessageStick.stick_term != '').distinct().all()),
    }
def contribution_detail_html(context=None, request=None, **kw):
    if context.id == 's4':
        raise HTTPFound(request.route_url('genealogy'))

    c = context.description
    if '<body>' in c:
        c = c.split('<body>')[1].split('</body>')[0]

    adapter = get_adapter(IRepresentation, Feature(), request, ext='snippet.html')
    fids = [
        m.group('fid') for m in re.finditer('__values_(?P<fid>[0-9A-Z]+)__', c)]

    for feature in DBSession.query(Feature)\
            .filter(Feature.id.in_(fids))\
            .options(joinedload(Feature.domain)):
        counts = DBSession.query(Value.domainelement_pk, func.count(Value.pk))\
            .filter(Value.domainelement_pk.in_([de.pk for de in feature.domain]))\
            .group_by(Value.domainelement_pk)
        feature.counts = dict(counts)
        table = soup(adapter.render(feature, request))
        values = '\n'.join(
            '%s' % table.find(tag).extract() for tag in ['thead', 'tbody'])
        c = c.replace('__values_%s__' % feature.id, values)

    return {'text': c.replace('http://wals.info', request.application_url)}
def main(args):
    user = getpass.getuser()
    data = Data()
    datadir = 'C:\\Python27\\glottobank\\Grambank\\' if user != 'robert' \
        else '/home/robert/venvs/glottobank/Grambank'

    dataset = common.Dataset(
        id=grambank.__name__,
        name="GramBank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    import_features_collaborative_sheet(datadir, data)
    import_cldf(os.path.join(datadir, 'datasets'), data)
    #print data.keys()
    #print data['Parameter'].keys()
    #parameter = data['Parameter'].get(row['Feature_ID'])

    load_families(data, data['GrambankLanguage'].values(), isolates_icon='tcccccc')
def update_lang(lang, **kw):
    """
    Store the original name in hname.

    .. notes::

        We don't update the alternative names (for name search) here; instead,
        the script to update these names in bulk must be run after this function.
    """
    name = kw.pop('name', None)
    if name and name != lang.name:
        if 'hname' not in lang.jsondata:
            lang.update_jsondata(hname=lang.name)
        print 'renamed', lang.name, 'to', name
        lang.name = name
        print lang.jsondata

    for k, v in kw.items():
        if k not in lang.datadict():
            DBSession.add(Language_data(key=k, value=v, object_pk=lang.pk))
        else:
            for d in lang.data:
                if d.key == k and d.value != v:
                    print 'updated', k
                    d.value = v
                    break
def glottologmeta(request):
    q = DBSession.query(Languoid)\
        .filter(Language.active == True)\
        .filter(or_(
            Languoid.status == LanguoidStatus.established,
            Languoid.status == LanguoidStatus.unattested))
    qt = q.filter(Languoid.father_pk == None)

    res = {
        'last_update': DBSession.query(Language.updated)
            .order_by(desc(Language.updated)).first()[0],
        'number_of_families': qt.filter(Languoid.level == LanguoidLevel.family).count(),
        'number_of_isolates': qt.filter(Languoid.level == LanguoidLevel.language).count(),
    }

    ql = q.filter(Languoid.hid != None)
    res['number_of_languages'] = {
        'all': ql.count(),
        'pidgin': qt.filter(Language.name == 'Pidgin').one().child_language_count,
        'artificial': qt.filter(
            Language.name == 'Artificial Language').one().child_language_count,
        'sign': sum(
            l.child_language_count
            for l in qt.filter(Language.name.contains('Sign '))),
    }
    res['number_of_languages']['l1'] = res['number_of_languages']['all'] \
        - res['number_of_languages']['pidgin']\
        - res['number_of_languages']['artificial']\
        - res['number_of_languages']['sign']
    return res
def init_db():
    engine = create_engine('sqlite://')
    DBSession.configure(bind=engine)
    VersionedDBSession.configure(bind=engine)
    Base.metadata.bind = engine
    Base.metadata.create_all()
    return engine
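# Hedged counterpart sketch (assumption): undo init_db between tests by
# dropping the tables and clearing the scoped sessions again.
def teardown_db(engine):
    Base.metadata.drop_all(engine)
    DBSession.remove()
    VersionedDBSession.remove()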
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(
                sorted(vs.values, key=lambda v: v.description),
                key=lambda v: v.description):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name for w in words])))
        vs.description = '; '.join(d)

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(
                joinedload(getattr(model, 'taxa'))):
            if not instance.taxa:
                instance.active = False
def load_examples(self, dictionary, data, lang):
    abbr_p = re.compile(r'\$(?P<abbr>[a-z1-3][a-z]*(\.[a-z]+)?)')
    for i, ex in enumerate(
            Examples.from_file(self.dir.joinpath('processed', 'examples.sfm'))):
        obj = data.add(
            models.Example,
            ex.id,
            id='%s-%s' % (self.id, ex.id.replace('.', '_')),
            name=ex.text,
            number='{0}'.format(i + 1),
            source=ex.corpus_ref,
            language=lang,
            serialized='{0}'.format(ex),
            dictionary=dictionary,
            analyzed=ex.morphemes,
            gloss=abbr_p.sub(lambda m: m.group('abbr').upper(), ex.gloss)
                if ex.gloss else ex.gloss,
            description=ex.translation,
            alt_translation1=ex.alt_translation,
            alt_translation_language1=self.props.get('metalanguages', {}).get('gxx'),
            alt_translation2=ex.alt_translation2,
            alt_translation_language2=self.props.get('metalanguages', {}).get('gxy'))
        DBSession.flush()

        if ex.soundfile:
            self.add_file('audio', ex.soundfile, common.Sentence_files, obj)
def prime_cache(args):  # pragma: no cover
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    q = DBSession.query(models.Transcription) \
        .join(common.ValueSet, common.ValueSet.contribution_pk == models.Transcription.pk) \
        .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
        .join(models.Grapheme, models.Grapheme.pk == common.Value.pk)
    for t in q.filter(models.Grapheme.audio != None):
        t.with_audio = True
    for t in q.filter(models.Grapheme.image != None):
        t.with_image = True
    for t in q.filter(models.Grapheme.url != None):
        t.with_url = True

    for p in DBSession.query(common.Parameter) \
            .options(joinedload(common.Parameter.valuesets).joinedload(common.ValueSet.values)):
        p.representation = sum(len(vs.values) for vs in p.valuesets)

    for p in DBSession.query(models.Feature).options(joinedload(models.Feature.sounds)):
        p.count_sounds = len(p.sounds)

    for p in DBSession.query(common.Contribution)\
            .options(joinedload(common.Contribution.valuesets).joinedload(common.ValueSet.values)):
        p.items = sum(len(vs.values) for vs in p.valuesets)
def test_crud(self):
    from clld.db.migration import Connection

    migration = Connection(DBSession)
    assert len(list(migration.select(common.Identifier))) == 0
    pk = migration.insert(
        common.Identifier, id='iso-csw', name='csw', type=common.IdentifierType.iso.value)
    assert migration.pk(common.Identifier, 'iso-csw') == pk
    assert len(list(migration.select(common.Identifier))) == 1

    identifier = DBSession.query(common.Identifier).get(pk)
    assert identifier.active
    assert identifier.version == 1
    assert identifier.created
    assert identifier.updated

    migration.update(common.Identifier, [('name', 'cea')], pk=pk)
    DBSession.refresh(identifier)
    assert identifier.name == 'cea'

    migration.delete(common.Identifier, pk=pk)
    self.assertRaises(InvalidRequestError, DBSession.refresh, identifier)
def put_languoid(request):
    glottocode = request.matchdict['glottocode']
    languoid = query_languoid(DBSession, glottocode)
    if languoid is None:
        request.response.status = 404
        return {'error': 'Not a valid languoid ID'}

    json_data = request.json_body
    try:
        data, errors = LanguoidSchema(partial=True).load(json_data)
    except ValueError:
        request.response.status = 400
        return {'error': 'Not a valid languoid level'}
    if errors:
        request.response.status = 400
        return {'error': errors}

    try:
        for key, value in data.items():
            setattr(languoid, key, value)
        DBSession.flush()
    except exc.SQLAlchemyError as e:
        request.response.status = 400
        DBSession.rollback()
        return {'error': "{}".format(e)}

    return LanguoidSchema().dump(languoid).data
def test_crud(self):
    from clld.db.migration import Connection

    migration = Connection(DBSession)
    assert len(list(migration.select(common.Identifier))) == 0
    pk = migration.insert(
        common.Identifier, id='iso-csw', name='csw', type=common.IdentifierType.iso.value)
    assert migration.pk(common.Identifier, 'iso-csw') == pk
    assert len(list(migration.select(common.Identifier))) == 1

    identifier = DBSession.query(common.Identifier)\
        .options(undefer('*')).get(pk)
    assert identifier.active
    assert identifier.version == 1
    assert identifier.created
    assert identifier.updated

    migration.update(common.Identifier, [('name', 'cea')], pk=pk)
    DBSession.refresh(identifier)
    assert identifier.name == 'cea'

    migration.delete(common.Identifier, pk=pk)
    self.assertRaises(InvalidRequestError, DBSession.refresh, identifier)
def main(args):
    #get_obsolete_refs(args)
    with transaction.manager:
        #match_obsolete_refs(args)
        #
        # TODO:
        # - create bibtex file containing all refs to be removed!
        # -
        #
        matched = args.data_file(args.version, 'obsolete_refs_matched.json')
        if matched.exists():
            with open(matched) as fp:
                matched = json.load(fp)
        else:
            matched = {}

        for id_, repl in matched.items():
            if not repl:
                continue
            ref = Ref.get(id_, default=None)
            if ref is None:
                continue
            Config.add_replacement(ref, repl, session=DBSession, model=Source)
            DBSession.delete(ref)
def glottologmeta(request):
    q = DBSession.query(Languoid)\
        .filter(Language.active == true())\
        .filter(Languoid.status.in_(
            (LanguoidStatus.established, LanguoidStatus.unattested)))
    qt = q.filter(Languoid.father_pk == null())

    res = {
        'last_update': DBSession.query(Language.updated)
            .order_by(Language.updated.desc()).first()[0],
        'number_of_families': qt.filter(Languoid.level == LanguoidLevel.family).count(),
        'number_of_isolates': qt.filter(Languoid.level == LanguoidLevel.language).count(),
    }

    ql = q.filter(Languoid.hid != null())
    res['number_of_languages'] = {'all': ql.count()}
    res['special_families'] = OrderedDict()
    for name in SPECIAL_FAMILIES:
        l = qt.filter(Language.name == name).one()
        res['special_families'][name] = l
        res['number_of_languages'][name] = l.child_language_count

    res['number_of_languages']['l1'] = res['number_of_languages']['all'] \
        - res['number_of_languages']['Pidgin']\
        - res['number_of_languages']['Artificial Language']\
        - res['number_of_languages']['Sign Language']
    return res
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(
                sorted(vs.values, key=lambda v: v.description),
                key=lambda v: v.description):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name for w in words])))
        vs.description = '; '.join(d)

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(
                joinedload(getattr(model, 'taxa'))):
            if not instance.taxa:
                instance.active = False
def query(self, req):
    self._domainelements = DBSession.query(DomainElement).all()
    return DBSession.query(Language)\
        .order_by(Language.id)\
        .options(
            joinedload_all(Language.valuesets, ValueSet.values),
            joinedload_all(WalsLanguage.genus, Genus.family))
def langdoccomplexquery(request):
    res = {
        'dt': None,
        'doctypes': DBSession.query(Doctype).order_by(Doctype.id),
        'macroareas': DBSession.query(Macroarea).order_by(Macroarea.id),
        'ms': {},
    }

    for name, cls, kw in [
        ('languoids', LanguoidsMultiSelect, dict(
            url=request.route_url('glottolog.childnodes'))),
        ('macroareas', MultiSelect, dict(collection=res['macroareas'])),
        ('doctypes', MultiSelect, dict(collection=res['doctypes'])),
    ]:
        res['ms'][name] = cls(request, name, 'ms' + name, **kw)

    res['params'], reqparams = get_params(request.params, **res)
    res['refs'] = getRefs(res['params'])

    if res['refs']:
        res['dt'] = Refs(request, Source, cq=1, **reqparams)

    fmt = request.params.get('format')
    if fmt:
        db = bibtex.Database([ref.bibtex() for ref in res['refs']])
        for name, adapter in request.registry.getAdapters([db], IRepresentation):
            if name == fmt:
                return adapter.render_to_response(db, request)
        return HTTPNotAcceptable()

    return res
def parameter_detail_html(context=None, request=None, **kw):
    values = DBSession.query(common.Value.pk)\
        .join(common.ValueSet)\
        .filter(common.ValueSet.parameter_pk == context.pk)\
        .subquery()
    return {
        'examples': DBSession.query(common.Sentence)
            .join(common.ValueSentence)
            .filter(common.ValueSentence.value_pk.in_(values))}
def update_providers(args):
    if not args.data_file(args.version, 'provider.txt').exists():
        return

    with open(args.data_file(args.version, 'provider.txt')) as fp:
        content = fp.read().decode('latin1')

    if '\r\n' in content:
        content = content.replace('\r\n', '\n')

    provider_map = get_map(Provider)
    for block in content.split('\n\n\n\n'):
        lines = block.split('\n')
        id_, abbr = lines[0].strip().split(':')
        id_ = id_.split('.')[0]
        description = unescape('\n'.join(lines[1:]))
        name = description.split('.')[0]

        if id_ == 'hedvig-tirailleur':
            id_ = u'skirgard'

        if slug(id_) not in provider_map:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(
                Provider(id=slug(id_), name=name, description=description, abbr=abbr))
def insert(db, table, model, value, start=0, order=None):
    log.info('migrating %s ...' % table)
    sql = 'select * from %s' % table
    values = []

    if not order:
        log.info(sql)
        values = [
            value(start + i + 1, row) for i, row in enumerate(db.execute(sql))]
        DBSession.execute(model.__table__.insert(), values)
    else:
        def handler(offset, batch):
            _values = [
                value(start + offset + i + 1, row) for i, row in enumerate(batch)]
            DBSession.execute(model.__table__.insert(), _values)
            values.extend(_values)

        order = [order] if isinstance(order, basestring) else order
        select(db, '%s order by %s' % (sql, ', '.join(order)), handler)

    DBSession.execute('COMMIT')
    log.info('... done')
    return values
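# Hedged usage sketch (assumption): migrate a legacy "language" table with the
# insert() helper above, mapping each legacy row to keyword arguments for the
# target model via a small factory function.
def migrate_languages(db):
    def value(pk, row):
        return dict(pk=pk, id=str(row['id']), name=row['name'])

    return insert(db, 'language', common.Language, value, order='id')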
def update_lang(lang, **kw):
    """
    Store the original name in hname.

    .. notes::

        We don't update the alternative names (for name search) here; instead,
        the script to update these names in bulk must be run after this function.
    """
    name = kw.pop('name', None)
    if name and name != lang.name:
        if 'hname' not in lang.jsondatadict:
            lang.update_jsondata(hname=lang.name)
        print 'renamed', lang.name, 'to', name
        lang.name = name
        print lang.jsondata

    for k, v in kw.items():
        if k not in lang.datadict():
            DBSession.add(Language_data(key=k, value=v, object_pk=lang.pk))
        else:
            for d in lang.data:
                if d.key == k and d.value != v:
                    print 'updated', k
                    d.value = v
                    break
def test_crud(db):
    migration = Connection(DBSession)
    assert len(list(migration.select(common.Identifier))) == 0
    pk = migration.insert(
        common.Identifier, id='iso-csw', name='csw', type=common.IdentifierType.iso.value)
    assert migration.pk(common.Identifier, 'iso-csw') == pk
    assert len(list(migration.select(common.Identifier))) == 1

    identifier = DBSession.query(common.Identifier)\
        .options(undefer('*')).get(pk)
    assert identifier.active
    assert identifier.version == 1
    assert identifier.created
    assert identifier.updated

    migration.update(common.Identifier, [('name', 'cea')], pk=pk)
    DBSession.refresh(identifier)
    assert identifier.name == 'cea'

    migration.delete(common.Identifier, pk=pk)
    with pytest.raises(InvalidRequestError):
        DBSession.refresh(identifier)
def main(args):
    datadir = '/home/robert/venvs/glottobank/lexibank'

    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="LexiBank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

        for provider in ['transnewguinea', 'abvd', 'ids']:
            import_cldf(os.path.join(datadir, provider, 'cldf'), provider)

    with transaction.manager:
        load_families(Data(), DBSession.query(LexibankLanguage), isolates_icon='tcccccc')
def macroareas(args, languages, stats):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated
    # macroareas for families easier
    lang_map = {}

    for hid, info in get_lginfo(args, lambda x: x.macro_area):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        if not languages[hid]:
            continue
        lang_map[languages[hid].pk] = languages[hid]
        a, r = update_relationship(languages[hid].macroareas, [ma_map[info.macro_area]])
        if a or r:
            stats.update(['macroarea'])

    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == true()):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk)\
                .filter(TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)
        a, r = update_relationship(family.macroareas, mas)
        if a or r:
            stats.update(['macroarea'])

    args.log.info('macroareas done')
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=culturebank.__name__,
        name="CultureBank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='culturebank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'No license yet'})
            # 'Creative Commons Attribution 4.0 International License'
    DBSession.add(dataset)

    import_features_collaborative_sheet(CULTUREBANK_REPOS, data)
    import_cldf(os.path.join(CULTUREBANK_REPOS, 'datasets'), data)
    ##import_cldf("C:\\python27\\dbs\\bwohh\\", data, add_missing_features=True)

    load_families(
        data,
        list(data['CulturebankLanguage'].values()),
        isolates_icon='tcccccc')
    return
def handler(offset, batch):
    _values = [
        value(start + offset + i + 1, row) for i, row in enumerate(batch)]
    DBSession.execute(model.__table__.insert(), _values)
    values.extend(_values)
def langdoccomplexquery(request):
    res = {
        'dt': None,
        'doctypes': DBSession.query(Doctype).order_by(Doctype.id),
        'macroareas': DBSession.query(Macroarea).order_by(Macroarea.id),
        'ms': {},
    }

    for name, cls, kw in [
        ('languoids', LanguoidsMultiSelect,
         dict(url=request.route_url('glottolog.childnodes'))),
        ('macroareas', MultiSelect, dict(collection=res['macroareas'])),
        ('doctypes', MultiSelect, dict(collection=res['doctypes'])),
    ]:
        res['ms'][name] = cls(request, name, 'ms' + name, **kw)

    res['params'], reqparams = get_params(request.params, **res)
    res['refs'] = getRefs(res['params'])

    if res['refs']:
        res['dt'] = Refs(request, Source, cq=1, **reqparams)

    fmt = request.params.get('format')
    if fmt:
        db = bibtex.Database([ref.bibtex() for ref in res['refs']])
        for name, adapter in request.registry.getAdapters([db], IRepresentation):
            if name == fmt:
                return adapter.render_to_response(db, request)
        return HTTPNotAcceptable()

    return res
def import_cognatesets(dataset, forms, bibliography, contribution, cognatesets={}):
    cognateset_by_formid = {}
    cognateset_forms = {}

    for row in dataset["CognateTable"].iterdicts():
        # Only incorporate the newest cognate codings, and be robust about that
        try:
            cs = cognateset_forms.setdefault(row["Cognateset_ID"], [])
            cs.append(forms[row["Form_ID"]].name)
            row["CognateForms"] = cs
            cognateset_by_formid[row["Form_ID"]] = row
        except KeyError:
            continue

    for row in cognateset_by_formid.values():
        cognateset_id = row["Cognateset_ID"]
        try:
            cognateset = cognatesets[cognateset_id]
        except KeyError:
            row["CognateForms"].sort()
            cognateset = cognatesets[cognateset_id] = Cognateset(
                id=row["Cognateset_ID"],
                contribution=contribution,
                name=row["CognateForms"][len(row["CognateForms"]) // 2])

        assoc = CognatesetCounterpart(
            cognateset=cognateset,
            doubt=True if "LexStat" in row["Source"] else False,
            alignment=(None if not row["Alignment"] else " ".join(row["Alignment"])),
            counterpart=forms[row["Form_ID"]])

        for source in row["Source"]:
            DBSession.add(CognatesetCounterpartReference(
                cognatesetcounterpart_pk=assoc.pk,
                source=bibliography[source]))
def glottologmeta(request):
    q = DBSession.query(Languoid)
    qt = q.filter(Languoid.father_pk == null())

    res = {
        'last_update': DBSession.query(Language.updated).order_by(
            Language.updated.desc()).first()[0],
        'number_of_families': qt.filter(Languoid.level == LanguoidLevel.family).count(),
        'number_of_isolates': qt.filter(Languoid.level == LanguoidLevel.language).count(),
    }

    bookkeeping = DBSession.query(Language).filter(
        Language.name == 'Bookkeeping').one()
    ql = q.filter(and_(
        Languoid.level == LanguoidLevel.language,
        Languoid.family_pk != bookkeeping.pk))
    res['number_of_languages'] = {'all': ql.count()}
    res['special_families'] = OrderedDict()
    res['number_of_languages']['l1'] = res['number_of_languages']['all']

    for name in SPECIAL_FAMILIES:
        l = qt.filter(Language.name == name).one()
        res['special_families'][name] = l
        res['number_of_languages'][name] = l.child_language_count
        res['number_of_languages']['l1'] -= l.child_language_count

    return res
def update_providers(args, verbose=False):
    filepath = args.data_dir.joinpath('references', 'bibtex', 'BIBFILES.ini')
    p = RawConfigParser()
    with io.open(filepath, encoding='utf-8-sig') as fp:
        p.readfp(fp)

    provider_map = get_map(Provider)
    for section in p.sections():
        sectname = section[:-4] if section.endswith('.bib') else section
        id_ = slug(sectname)
        attrs = {
            'name': p.get(section, 'title'),
            'description': p.get(section, 'description'),
            'abbr': p.get(section, 'abbr'),
        }
        if id_ in provider_map:
            provider = provider_map[id_]
            for a in list(attrs):
                before, after = getattr(provider, a), attrs[a]
                if before == after:
                    del attrs[a]
                else:
                    setattr(provider, a, after)
                    attrs[a] = (before, after)
            if attrs:
                args.log.info('updating provider %s %s' % (slug(id_), sorted(attrs)))
                if verbose:
                    for a, (before, after) in attrs.items():
                        before, after = (' '.join(_.split()) for _ in (before, after))
                        if before != after:
                            args.log.info('%s\n%r\n%r' % (a, before, after))
        else:
            args.log.info('adding provider %s' % slug(id_))
            DBSession.add(Provider(id=id_, **attrs))
def import_sources(wordlist, contribution, contributors={}):
    """Load the bibliography."""
    contributions = {}
    by_name = {}
    for source in wordlist.sources.items():
        fields = source.entry.fields

        # Generate a citation from the source
        citation_contrib = None
        for role, people in source.entry.persons.items():
            if not people:
                continue
            names = " and ".join(map(str, people))
            fields[role] = names

            if not citation_contrib:
                if len(people) == 1:
                    citation_contrib = " ".join(people[0].last_names)
                elif len(people) == 2:
                    citation_contrib = "{:} & {:}".format(
                        " ".join(people[0].last_names),
                        " ".join(people[1].last_names))
                else:
                    citation_contrib = "{:} et al.".format(" ".join(people[0].last_names))

        if citation_contrib:
            if fields.get("year"):
                name = "{:}, {:}".format(citation_contrib, fields["year"])
            else:
                name = "{:}".format(citation_contrib)
        else:
            title_like = fields.get("title") or fields.get("note")
            if fields.get("year"):
                name = "{:}, {:}".format(title_like, fields["year"])
            else:
                name = "{:}".format(title_like)

        if name in by_name:
            name = "{:}a".format(name)
            while name in by_name:
                name = name[:-1] + chr(ord(name[-1]) + 1)

        # create a contribution
        contrib = LexiRumahSource(
            id=source.id,
            name=name,
            bibtex_type=vars(EntryType).get(source.genre) or EntryType.misc,
            provider=contribution)
        for key, value in fields.items():
            if hasattr(contrib, key) and not getattr(contrib, key):
                setattr(contrib, key, value)
            else:
                contrib.jsondata[key] = value
        DBSession.add(contrib)

        contributions[source.id] = contrib
        by_name[name] = contrib

    return contributions
def match_obsolete_refs(args):
    with open(args.data_file(args.version, 'obsolete_refs.json')) as fp:
        refs = json.load(fp)

    matched = args.data_file(args.version, 'obsolete_refs_matched.json')
    if matched.exists():
        with open(matched) as fp:
            matched = json.load(fp)
    else:
        matched = {}

    #
    # TODO: optionally re-evaluate known-unmatched refs!
    #
    count = 0
    f, m = 0, 0
    for id_ in refs:
        if id_ in matched:
            continue
        count += 1
        if count > 1000:
            print '1000 obsolete refs processed!'
            break

        ref = Ref.get(id_)
        found = False
        if ref.description and len(ref.description) > 5:
            for match in DBSession.query(Ref)\
                    .filter(not_(Source.id.in_(refs)))\
                    .filter(Source.description.contains(ref.description))\
                    .filter(or_(Source.author == ref.author, Source.year == ref.year))\
                    .limit(10):
                print '++', ref.id, '->', match.id, '++', ref.author, '->', match.author, '++', ref.year, '->', match.year
                matched[ref.id] = match.id
                found = True
                break

        if not found and ref.name and len(ref.name) > 5:
            for match in DBSession.query(Ref)\
                    .filter(not_(Source.id.in_(refs)))\
                    .filter(Source.name == ref.name)\
                    .limit(10):
                try:
                    if match.description and ref.description \
                            and slug(match.description) == slug(ref.description):
                        print '++', ref.id, '->', match.id, '++', ref.description, '->', match.description
                        matched[ref.id] = match.id
                        found = True
                        break
                except AssertionError:
                    continue

        if not found:
            m += 1
            print '--', ref.id, ref.name, ref.description
            matched[ref.id] = None
        else:
            f += 1

    print f, 'found'
    print m, 'missed'
    with open(args.data_file(args.version, 'obsolete_refs_matched.json'), 'w') as fp:
        json.dump(matched, fp)
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cdk.__name__,
        name="CDK",
        description="Comprehensive Dictionary of Ket",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cdk.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    contrib = common.Contribution(id='ket', name=dataset.name)
    DBSession.add(contrib)

    for i, (id, name) in enumerate([
        ('kotorov', 'E.G. Kotorova'),
        ('nefedov', 'A.V. Nefedov'),
    ]):
        dataset.editors.append(
            common.Editor(contributor=common.Contributor(id=id, name=name), ord=i))

    ket = data.add(
        common.Language, 'ket',
        id='ket',
        name='Ket',
        latitude=63.76,
        longitude=87.55)
    add_language_codes(data, ket, 'ket', glottocode='kett1243')
    for abbr, name in DIALECTS.items():
        data.add(common.Language, abbr, id=abbr, name=name)

    with args.data_file('sources.txt').open(encoding='utf8') as fp:
        for i, chunk in enumerate(fp.read().split('\n\n\n')):
            try:
                id_, year, author, desc = chunk.split('\n')
            except:
                print(chunk)
                raise
            data.add(
                common.Source, id_,
                id=str(i + 1),
                name=id_,
                author=author,
                year=year,
                description=desc)

    with UnicodeReader(args.data_file('Ket_nouns_and_other_pos_table.docx.csv')) as reader:
        load(data, reader, ket, contrib, verbs=False)

    with UnicodeReader(args.data_file('Ket_verbs_table.docx.csv')) as reader:
        load(data, reader, ket, contrib)

    print('parsing examples problematic in %s cases' % len(PROBLEMS))
def test_CustomModelMixin(self):
    from clld.tests.fixtures import CustomLanguage

    DBSession.add(CustomLanguage(id='abc', name='Name', custom='c'))
    DBSession.flush()
    for lang in DBSession.query(Language).filter(Language.id == 'abc'):
        self.assertEqual(lang.custom, 'c')
        break
def setup_session(config_uri, engine=None):
    setup_logging(config_uri)
    settings = get_appsettings(config_uri)
    engine = engine or engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    VersionedDBSession.configure(bind=engine)
    Base.metadata.create_all(engine)
    return path(config_uri.split('#')[0]).abspath().dirname().basename()
def get_sticks(source):
    res = {}
    obj_pks = DBSession.query(Contribution_files.object_pk).filter(
        Contribution_files.name == source.name).distinct()
    q = DBSession.query(Contribution).filter(
        Contribution.pk.in_(obj_pks)).distinct()
    res[Contribution.__name__.lower()] = q.all()
    return res
def setup_session(config_uri, engine=None):
    setup_logging(config_uri)
    settings = get_appsettings(config_uri)
    engine = engine or engine_from_config(settings, "sqlalchemy.")
    DBSession.configure(bind=engine)
    VersionedDBSession.configure(bind=engine)
    Base.metadata.create_all(engine)
    return Path(config_uri.split("#")[0]).resolve().parent.name
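# Hedged usage sketch (assumption): both setup_session variants above are meant
# to be called with a Paste-style config URI before running import scripts; the
# return value is the name of the directory containing the ini file. The file
# name used here is illustrative only.
def example_setup(engine=None):
    project_name = setup_session('development.ini#main', engine=engine)
    assert DBSession.bind is not None
    return project_name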
def test_Dataset(self):
    from clld import RESOURCES
    from clld.db.models.common import Dataset, Source

    d = Dataset(id='abc', domain='test')
    DBSession.add(d)
    DBSession.flush()
    d.get_stats(RESOURCES, source=Source.id == None)
def quicksearch(request):
    message = None
    query = DBSession.query(Languoid)

    term = request.params['search'].strip()
    titlecase = term.istitle()
    term = term.lower()
    params = {'iso': '', 'country': '', 'name': '',
              'namequerytype': 'part', 'multilingual': ''}

    if not term:
        query = None
    elif len(term) < 3:
        query = None
        message = ('Please enter at least four characters for a name search '
                   'or three characters for an iso code')
    elif len(term) == 3 and not titlecase:
        query = query.filter(Languoid.identifiers.any(
            type=IdentifierType.iso.value, name=term))
        kind = 'ISO 639-3'
    elif len(term) == 8 and GLOTTOCODE_PATTERN.match(term):
        query = query.filter(Languoid.id == term)
        kind = 'Glottocode'
    else:
        _query = query.filter(func.lower(Languoid.name) == term)
        if DBSession.query(_query.exists()).scalar():
            query = _query
        else:
            query = query.filter(or_(
                func.lower(Languoid.name).contains(term),
                Languoid.identifiers.any(and_(
                    Identifier.type == u'name',
                    Identifier.description == Languoid.GLOTTOLOG_NAME,
                    func.lower(Identifier.name).contains(term)))))
        kind = 'name part'
        params['name'] = term

    if query is None:
        languoids = []
    else:
        languoids = query.order_by(Languoid.name)\
            .options(joinedload(Languoid.family)).all()
        # report on the outcome only when a search was actually run,
        # so that `kind` is guaranteed to be defined here
        if not languoids:
            term_pre = HTML.kbd(term, style='white-space: pre')
            message = 'No matching languoids found for %s "' % kind + term_pre + '"'
        elif len(languoids) == 1:
            raise HTTPFound(request.resource_url(languoids[0]))

    map_, icon_map, family_map = get_selected_languages_map(request, languoids)
    layer = list(map_.get_layers())[0]
    if not layer.data['features']:
        map_ = None

    countries = json.dumps([
        '%s (%s)' % (c.name, c.id)
        for c in DBSession.query(Country).order_by(Country.description)])

    return {'message': message, 'params': params, 'languoids': languoids,
            'map': map_, 'countries': countries}
def language_detail_html(context=None, request=None, **kw):
    # makes sure all display elements have a value
    param_word = {p.id: '#' for p in DBSession.query(Parameter)}

    # override the param_word dict with values from the DB
    for word in DBSession.query(Value)\
            .join(ValueSet)\
            .filter(ValueSet.language_pk == context.pk)\
            .options(joinedload_all(Value.valueset, ValueSet.parameter)):
        param_word[word.valueset.parameter.id] = word.name

    def thead(*cols):
        return HTML.thead(
            HTML.tr(
                HTML.th("", style="height:26px; font-weight:"),
                *[HTML.th(col) for col in cols],
                **dict(style="background: #F2F2F2")))

    def td(p):
        return HTML.td(param_word.get(p, '') if p else '')

    def tr(name, *params):
        return HTML.tr(
            HTML.td(
                name,
                style="height:26px; font-weight: bold; background: #F2F2F2; padding: 5px"),
            *[td(p) for p in params])

    def table(*cols, **kw):
        male_cols = kw.get('male', ['m' + col for col in cols])
        female_cols = kw.get('female', ['f' + col for col in cols])
        return HTML.table(
            thead(*cols),
            HTML.tbody(tr('male', *male_cols), tr('female', *female_cols)))

    # create a paradigm_tables dict for the HTML rendering
    paradigm_tables = {
        'pronouns': HTML.table(
            thead("A", "S", "O", "P"),
            HTML.tbody(
                tr('1st (excl) Person Singular', '1sg_a', '1sg_s', '1sg_o', '1sg_p'),
                tr('1st (excl) Person Dual', '1du_a', '1du_s', '1du_o', '1du_p'),
                tr('1st (excl) Person Plural', '1pl_a', '1pl_s', '1pl_o', '1pl_p'),
                tr('1st (incl) Person Dual', '12du_a', '12du_s', '12du_o', '12du_p'),
                tr('1st (incl) Person Plural', '12pl_a', '12pl_s', '12pl_o', '12pl_p'),
                tr('2nd Person Singular', '2sg_a', '2sg_s', '2sg_o', '2sg_p'),
                tr('2nd Person Dual', '2du_a', '2du_s', '2du_o', '2du_p'),
                tr('2nd Person Plural', '2pl_a', '2pl_s', '2pl_o', '2pl_p'),
                tr('3rd Person Singular Gender 1',
                   '3sg_gen1_a', '3sg_gen1_s', '3sg_gen1_o', '3sg_gen1_p'),
                tr('3rd Person Singular Gender 2',
                   '3sg_gen2_a', '3sg_gen2_s', '3sg_gen2_o', '3sg_gen2_p'),
                tr('3rd Person Dual', '3du_gen1_a', '3du_gen1_s', '3du_gen1_o', '3du_gen1_p'),
                tr('3rd Person Plural', '3pl_gen1_a', '3pl_gen1_s', '3pl_gen1_o', '3pl_gen1_p'),
            )
        ),
    }
    return paradigm_tables
def query(self, req):
    self._domainelements = DBSession.query(DomainElement).all()
    return DBSession.query(Language)\
        .order_by(Language.id)\
        .options(
            subqueryload_all('languageidentifier', 'identifier'),
            subqueryload_all('countries'),
            joinedload_all(Language.valuesets, ValueSet.values),
            joinedload_all(WalsLanguage.genus, Genus.family))
def setUp(self):
    from clld.tests.fixtures import CustomLanguage

    assert CustomLanguage
    engine = create_engine('sqlite://')
    DBSession.configure(bind=engine)
    VersionedDBSession.configure(bind=engine)
    Base.metadata.bind = engine
    Base.metadata.create_all()
def handler(offset, batch):
    svalues = []
    rvalues = []
    for row in batch:
        jsondata = json.loads(row['jsondata'] or "{}")
        jsondata['bibtexkey'] = row['bibtexkey']
        dicts = {
            's': dict(
                pk=row['id'],
                polymorphic_type='base',
                id=str(row['id']),
                name='%(author)s %(year)s' % row,
                description=row['title'],
                bibtex_type=getattr(EntryType, row['type']),
                jsondata=jsondata),
            'r': dict(pk=row['id']),
        }
        for model, map_ in {
            's': {
                'author': None,
                'yearstring': 'year',
                'year': 'year_int',
                'startpage': 'startpage_int',
                'numberofpages': 'pages_int',
                'pages': None,
                'edition': None,
                'school': None,
                'address': None,
                'url': None,
                'note': None,
                'number': None,
                'series': None,
                'editor': None,
                'booktitle': None,
                'journal': None,
                'volume': None,
                'publisher': None,
            },
            'r': {
                'endpage': 'endpage_int',
                'inlg': None,
                'inlg_code': None,
                'subject': None,
                'subject_headings': None,
                'keywords': None,
                'normalizedauthorstring': None,
                'normalizededitorstring': None,
                'ozbib_id': None,
            },
        }.items():
            for okey, nkey in map_.items():
                dicts[model][nkey or okey] = row[okey]
        svalues.append(dicts['s'])
        rvalues.append(dicts['r'])

    DBSession.execute(common.Source.__table__.insert(), svalues)
    DBSession.execute(models2.Ref.__table__.insert(), rvalues)
def create_identifier(identifier, l, **kw):
    global MAX_IDENTIFIER_PK

    if identifier is None:
        MAX_IDENTIFIER_PK += 1
        DBSession.add(Identifier(pk=MAX_IDENTIFIER_PK, id=str(MAX_IDENTIFIER_PK), **kw))
        pk = MAX_IDENTIFIER_PK
    else:
        pk = identifier.pk

    DBSession.add(LanguageIdentifier(language_pk=l.pk, identifier_pk=pk))
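# Hedged usage sketch (assumption): look up an existing Identifier first, then
# let create_identifier either reuse it or mint a new one; in both cases the
# identifier is linked to the language. The type string is illustrative.
def add_iso_code(lang, code):
    existing = DBSession.query(Identifier)\
        .filter(Identifier.type == 'iso639-3')\
        .filter(Identifier.name == code)\
        .first()
    create_identifier(existing, lang, name=code, type='iso639-3')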