            'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )
    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(
        common.Contributor, "Kevin Scannell",
        id="Kevin Scannell", name="Kevin Scannell", email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)


def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """


if __name__ == '__main__':
    prepSysDirs()
    initializedb(create=main, prime_cache=prime_cache)
    sys.exit(0)
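# A minimal sketch of the kind of denormalization prime_cache() above is
# meant for (not part of the original script): caching a language count
# under a hypothetical `language_count` jsondata key, so it can be
# refreshed later without re-running the full import.
from clld.db.meta import DBSession
from clld.db.models import common

def prime_cache_sketch(args):
    dataset = DBSession.query(common.Dataset).first()
    dataset.update_jsondata(
        language_count=DBSession.query(common.Language).count())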
for parameter, valuesets in groupby(
    DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk),
    lambda vs: vs.parameter,
):
    parameter.representation = str(len(set(v.language_pk for v in valuesets)))
print("recomputation of representation done")
transaction.commit()
transaction.begin()

# cache iso codes for languages:
for language in DBSession.query(common.Language).options(
    joinedload_all(
        common.Language.languageidentifier,
        common.LanguageIdentifier.identifier)
):
    iso_codes = []
    for identifier in language.identifiers:
        if identifier.type == common.IdentifierType.iso.value:
            iso_codes.append(identifier.name)
    language.iso_codes = ", ".join(sorted(set(iso_codes)))
print("recomputation of iso codes done")
transaction.commit()
transaction.begin()

compute_language_sources()
transaction.commit()
transaction.begin()

gbs_func("update", args)


if __name__ == "__main__":  # pragma: no cover
    initializedb(create=main, prime_cache=prime_cache)
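# Note on the groupby call above: itertools.groupby only groups
# *consecutive* items, which is why the ValueSet query is ordered by
# parameter_pk before grouping. A standalone illustration:
from itertools import groupby

rows = [(2, 'b'), (1, 'a'), (2, 'c')]
assert [k for k, _ in groupby(rows, key=lambda r: r[0])] == [2, 1, 2]
assert [k for k, _ in groupby(sorted(rows), key=lambda r: r[0])] == [1, 2]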
    it will have to be run periodically whenever data has been updated.
    """
    for vs in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.values)):
        d = []
        for generic_term, words in groupby(
                sorted(vs.values, key=lambda v: v.description),
                key=lambda v: v.description):
            if generic_term:
                generic_term += ': '
            else:
                generic_term = ''
            d.append(generic_term + ', '.join(nfilter([w.name for w in words])))
        vs.description = '; '.join(d)

    for model in [models.Country, models.Ecoregion]:
        for instance in DBSession.query(model).options(
                joinedload(getattr(model, 'taxa'))):
            if not instance.taxa:
                instance.active = False

    # TODO: assign ThePlantList ids!


if __name__ == '__main__':
    initializedb(
        (('data_repos',), dict(action=ExistingDir)),
        create=main,
        prime_cache=prime_cache)
    sys.exit(0)
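# A standalone sketch of the description assembly above, using plain
# (description, name) tuples in place of ORM Value objects; the local
# nfilter mirrors clldutils.misc.nfilter (keep truthy items only):
from itertools import groupby

def nfilter(seq):
    return [e for e in seq if e]

values = [('tree', 'mopane'), ('tree', 'omusati'), ('', 'ehama')]
d = []
for generic_term, words in groupby(sorted(values), key=lambda v: v[0]):
    prefix = generic_term + ': ' if generic_term else ''
    d.append(prefix + ', '.join(nfilter([w[1] for w in words])))
assert '; '.join(d) == 'ehama; tree: mopane, omusati'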
choices = {}
for col in d.jsondata.get('custom_fields', []):
    values = [
        r[0]
        for r in DBSession.query(common.Unit_data.value)
        .filter(common.Unit_data.object_pk.in_(word_pks))
        .filter(common.Unit_data.key == col)
        .distinct()]
    if len(values) < 40:
        choices[col] = sorted(values)
d.update_jsondata(choices=choices)

DBSession.execute("""
UPDATE word
SET example_count = s.c
FROM (
    SELECT m.word_pk AS wpk, count(ms.sentence_pk) AS c
    FROM meaning AS m, meaningsentence AS ms
    WHERE m.pk = ms.meaning_pk
    GROUP BY m.word_pk
) AS s
WHERE word.pk = s.wpk
""")


if __name__ == '__main__':
    initializedb(
        (("--internal",), dict(action='store_true')),
        (("--no-concepts",), dict(action='store_true')),
        (("--dict",), dict()),
        create=main,
        prime_cache=prime_cache)
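# Note: UPDATE ... FROM as used above is PostgreSQL syntax. A portable
# sketch using a correlated subquery instead (unlike the original, this
# variant also zeroes example_count for words that have no examples):
DBSession.execute("""
UPDATE word SET example_count = (
    SELECT count(ms.sentence_pk)
    FROM meaning AS m JOIN meaningsentence AS ms ON m.pk = ms.meaning_pk
    WHERE m.word_pk = word.pk)
""")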
languages = import_languages()
import_cldf("datasets", features, languages, trust=trust)
if languages_path not in trust:
    languages.to_csv(languages_path, sep='\t', encoding='utf-8')
if features_path not in trust:
    features.to_csv(features_path, sep='\t', encoding='utf-8')


import sys
sys.argv = ["i", "../grambank/sqlite.ini"]

if model_is_available:
    from clld.scripts.util import initializedb
    from clld.db.util import compute_language_sources
    try:
        initializedb(create=main, prime_cache=lambda x: None)
    except SystemExit:
        print("done")
else:
    parser = argparse.ArgumentParser(
        description="Process GramRumah data with consistency in mind")
    parser.add_argument(
        "--sqlite",
        default=None,
        const="gramrumah.sqlite",
        nargs="?",
        help="Generate an sqlite database from the data")
    parser.add_argument(
        "--trust", "-t",
        nargs="*",
        type=argparse.FileType("r"),
        default=[],
initializedb's instance in the undocumented and weird way that seems to
have been provided for such cases. The latter option is implemented.
"""


if __name__ == '__main__':
    if os.path.exists('db.sqlite'):
        os.remove('db.sqlite')
    main_data_arg = [('main_data', ), {
        'help': 'path to the tsv file that contains the TuLeD data'}]
    lang_data_arg = [('lang_data', ), {
        'help': 'path to the tsv file that contains the language data'}]
    concept_data_arg = [('concept_data', ), {
        'help': 'path to the tsv file that contains the concept data'}]
    sources_data_arg = [('sources_data', ), {
        'help': 'path to the bibtex file that contains the references'}]
    initializedb(
        main_data_arg, lang_data_arg, concept_data_arg, sources_data_arg,
        create=main)
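# A hypothetical helper (not the script's actual main) illustrating how
# the extra argument tuples above come back: initializedb forwards each
# (args, kwargs) pair to argparse's add_argument, so the parsed values
# appear as attributes on the namespace handed to create/prime_cache.
def _show_extra_args(args):
    print(args.main_data)     # path to the TuLeD tsv file
    print(args.sources_data)  # path to the bibtex file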
ltable = models2.Languoid.__table__

# we compute the ancestry for each single languoid
for lid, fid in DBSession.execute(
        'select pk, father_pk from languoid').fetchall():
    depth = 0
    DBSession.execute(SQL, dict(child_pk=lid, parent_pk=lid, depth=depth))

    # now follow up the line of ancestors
    while fid:
        depth += 1
        DBSession.execute(SQL, dict(child_pk=lid, parent_pk=fid, depth=depth))
        fid = DBSession.execute(
            sql.select([ltable.c.father_pk]).where(ltable.c.pk == fid)
        ).fetchone()[0]

# we also pre-compute counts of descendants for each languoid:
for level in ['language', 'dialect', 'family']:
    DBSession.execute("""\
UPDATE languoid SET child_%(level)s_count = (
    SELECT count(*)
    FROM treeclosuretable as t, languoid as l
    WHERE languoid.pk = t.parent_pk
    AND languoid.pk != t.child_pk
    AND t.child_pk = l.pk
    AND l.level = '%(level)s'
)""" % locals())
DBSession.execute('COMMIT')


if __name__ == '__main__':
    initializedb(create=create, prime_cache=prime_cache)
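# `SQL` is not defined in this excerpt; from the way it is executed above
# (with child_pk, parent_pk and depth parameters), it is presumably an
# insert into the closure table along these lines:
SQL = """\
INSERT INTO treeclosuretable (child_pk, parent_pk, depth)
VALUES (:child_pk, :parent_pk, :depth)"""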
lemma_pattern = re.compile(r"(?P<cf>Cf\.\s*)?‘(?P<lemma>[^’]+)’", re.MULTILINE)

def language_repl(m):
    return '**%s**' % m.group('id')

language_pattern = re.compile(
    '(?P<id>%s)' % '|'.join(k.upper() for k in LANGUAGES.keys()))

for entry in entries.values():
    if entry.description:
        # print ('\\lx %s' % entry.name).encode('utf8')
        entry.description = lemma_pattern.sub(lemma_repl, entry.description)
        entry.description = language_pattern.sub(language_repl, entry.description)

print('hits:', len(hit))
print('miss:', len(miss))

def level(l):
    _level = 0
    while l.parent:
        _level += 1
        l = l.parent
    return _level

for lang in DBSession.query(models.Languoid):
    lang.level = level(lang)


if __name__ == '__main__':
    initializedb(create=main, prime_cache=prime_cache, bootstrap=True)
    sys.exit(0)
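# A standalone sketch of the markup rewriting above; LANGUAGES and
# lemma_repl are not shown in the excerpt, so a toy mapping and a
# lambda stand in for them here:
import re

LANGUAGES = {'poc': None}
lemma_pattern = re.compile(r"(?P<cf>Cf\.\s*)?‘(?P<lemma>[^’]+)’")
language_pattern = re.compile('(?P<id>%s)' % '|'.join(k.upper() for k in LANGUAGES))
text = 'Cf. ‘water’ in POC'
text = lemma_pattern.sub(lambda m: '*%s*' % m.group('lemma'), text)
text = language_pattern.sub(language_repl, text)
print(text)  # -> *water* in **POC**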