def test_init3(self):  # with kw check=True
    bad_file = Path(test_data('bad_file.tsv'))
    assert_raises(ValueError, LexStat, bad_file.as_posix())
    ls = self._make_one(bad_file.as_posix(), check=True, apply_checks=True)
    assert hasattr(ls, 'errors')
    cleaned = bad_file.parent.joinpath(bad_file.name + '_cleaned.tsv')
    self.assertTrue(cleaned.exists())
    os.remove(cleaned.as_posix())
    assert_raises(ValueError, LexStat, {0: ['concept', 'language', 'ipa']})
def _download_sql_dump(rel, log):
    target = Path('glottolog-{0}.sql.gz'.format(rel['version']))
    log.info('retrieving {0}'.format(rel['sql_dump_url']))
    urlretrieve(rel['sql_dump_url'], target.as_posix())
    assert md5(target) == rel['sql_dump_md5']
    unpacked = target.with_suffix('')
    with gzip.open(target.as_posix()) as f, unpacked.open('wb') as u:
        shutil.copyfileobj(f, u)
    target.unlink()
    log.info('SQL dump for Glottolog release {0} written to {1}'.format(
        rel['version'], unpacked))
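# For orientation: `rel` is a mapping describing one Glottolog release; the
# function above only relies on the three keys shown here. The values are
# illustrative placeholders, not a real URL or checksum.
rel = {
    'version': '4.8',
    'sql_dump_url': 'https://example.org/glottolog-4.8.sql.gz',
    'sql_dump_md5': '0123456789abcdef0123456789abcdef',
}
# _download_sql_dump(rel, logging.getLogger(__name__))  # would hit the network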
def create(self, dir_, content):
    """Write ``content`` to a file using ``dir_`` as file-system directory.

    :return: File-system path of the file that was created.
    """
    p = Path(dir_).joinpath(self.relpath)
    if not p.parent.exists():
        p.parent.mkdir(parents=True)
    with open(p.as_posix(), 'wb') as fp:
        if isinstance(content, text_type):
            content = content.encode('utf8')
        fp.write(content)
    return p.as_posix()
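# A minimal, self-contained sketch of the same pattern (target directory given
# at call time, `relpath` stored on the object). The class and names below are
# illustrative stand-ins, not part of the library.
from pathlib import Path
import tempfile


class StaticFile(object):
    """Illustrative holder for an object carrying a ``relpath``."""

    def __init__(self, relpath):
        self.relpath = relpath

    def create(self, dir_, content):
        p = Path(dir_).joinpath(self.relpath)
        p.parent.mkdir(parents=True, exist_ok=True)
        if isinstance(content, str):
            content = content.encode('utf8')
        p.write_bytes(content)
        return p.as_posix()


tmp = tempfile.mkdtemp()
print(StaticFile('icons/marker.svg').create(tmp, '<svg/>'))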
def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """
    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path('tree')
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()

    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            except:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError('please remove %s before proceeding' % builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()

    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError('unattached dialect')  # pragma: no cover
        lang2tree(
            lang, languages[lang.lineage[0][1]].lineage + lang.lineage, out, old_tree)
def download_tables(outdir=None):
    match = ZIP_NAME_PATTERN.search(urlopen(BASE_URL + 'download.asp').read())
    if not match:
        raise ValueError('no matching zip file name found')  # pragma: no cover
    target = Path(outdir or '.').joinpath(match.group('name'))
    urlretrieve(BASE_URL + match.group('name'), target.as_posix())
    return target
def write(self, outdir='.', suffix='.csv', cited_sources_only=False, archive=False):
    outdir = Path(outdir)
    if not outdir.exists():
        raise ValueError(outdir.as_posix())

    close = False
    if archive:
        if isinstance(archive, Archive):
            container = archive
        else:
            container = Archive(outdir.joinpath(self.name + '.zip'), mode='w')
            close = True
    else:
        container = outdir

    fname = Path(outdir).joinpath(self.name + suffix)
    if fname.suffix in TAB_SUFFIXES:
        self.table.dialect.delimiter = '\t'

    with UnicodeWriter(
            None if isinstance(container, Archive) else fname,
            delimiter=self.table.dialect.delimiter) as writer:
        writer.writerow(self.fields)
        for row in self.rows:
            writer.writerow(row.to_list())

    if isinstance(container, Archive):
        container.write_text(writer.read(), fname.name)
    self.table.url = fname.name

    self.metadata.write(Dataset.filename(fname, 'metadata'), container)
    ids = self._cited_sources if cited_sources_only else None
    self.sources.write(Dataset.filename(fname, 'sources'), container, ids=ids)
    if close:
        container.close()
class TemporaryPath(object):
    def __init__(self, suffix=''):
        fp = NamedTemporaryFile(suffix=suffix)
        self.name = Path(fp.name)
        fp.close()

    def __enter__(self):
        return self.name.as_posix()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.name.exists():
            remove(self.name)
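# Usage sketch for TemporaryPath: a name is reserved up front but the file does
# not exist on entering the context, so it can be handed to code that insists on
# creating the file itself. This assumes the names used above come from
# tempfile.NamedTemporaryFile, pathlib.Path and os.remove, as they suggest.
with TemporaryPath(suffix='.csv') as fname:
    with open(fname, 'w') as fp:
        fp.write('a,b\n1,2\n')
# leaving the block removes the file again, if it was created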
def dependencies_graph(imps):
    deps = dict([((f1, f2), v) for (v, f1, f2) in imps if v > 0.0])
    V = set([f for fs in deps for f in fs])
    G = dict([(k, v) for (k, v) in deps.items() if v > 0.0])
    MSTs = [mst(G, x) for x in V]
    (mv, H) = max([(sum(H.values()), H) for H in MSTs])
    #W = dict([(y, 1.0-v) for ((x, y), v) in H.items()])
    #sav(dot(H, V), 'grambank_mst.gv')
    path = Path(grambank.__file__).parent.joinpath('static', 'dependencies.gv')
    with open(path.as_posix(), 'w') as fp:
        fp.write(dot(H, V))
    return (H, V)  # dot(H, V)
def test_extractor(self):
    config = self.make_cfg(
        [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = self.tmp.joinpath("beastling.xml")
    xml.write_file(xmlfile.as_posix())
    self.assertTrue(bool(self._extract(xmlfile)))

    config = self.make_cfg({
        'admin': {'basename': 'abcdefg'},
        'model': {
            'model': 'mk',
            'data': data_path('basic.csv').as_posix()}})
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = self.tmp.joinpath("beastling.xml")
    xml.write_file(xmlfile.as_posix())
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    self.assertTrue(p.exists())
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    remove(p)
    self.assertEqual(cfg['admin']['basename'], 'abcdefg')
    self.assertEqual(cfg['model']['model'], 'mk')

    fname = self.tmp.joinpath('test.xml')
    datafile = self.tmp.joinpath('test.csv')
    self.assertFalse(datafile.exists())
    with fname.open('w', encoding='utf8') as fp:
        fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
<!--%s
%s
[admin]
[model]
-->
<!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
    res = self._extract(fname)
    self.assertIn(datafile.name, ''.join(res))
def create(self, req, filename=None, verbose=True):
    p = self.abspath(req)
    if not p.parent.exists():  # pragma: no cover
        p.parent.mkdir()
    tmp = Path('%s.tmp' % p.as_posix())

    if self.rdf:
        # we do not create archives with a readme for rdf downloads, because each
        # RDF entity points to the dataset and the void description of the dataset
        # covers all relevant metadata.
        #
        # TODO: write test for the file name things!?
        #
        with closing(GzipFile(
                filename=Path(tmp.stem).stem, fileobj=tmp.open('wb'))) as fp:
            self.before(req, fp)
            for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                self.dump(req, fp, item, i)
            self.after(req, fp)
    else:
        with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
            if not filename:
                fp = self.get_stream()
                self.before(req, fp)
                for i, item in enumerate(
                        page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
                zipfile.writestr(self.name, self.read_stream(fp))
            else:  # pragma: no cover
                zipfile.write(filename, self.name)
            zipfile.writestr(
                'README.txt',
                README.format(
                    req.dataset.name,
                    '=' * (len(req.dataset.name) + len(' data download')),
                    req.dataset.license,
                    TxtCitation(None).render(req.dataset, req)).encode('utf8'))
    if p.exists():  # pragma: no cover
        remove(p)
    move(tmp, p)
def test_extractor(config_factory, tmppath, data_dir):
    config = config_factory("admin", "mk", "embed_data")
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    assert bool(_extract(xmlfile))

    config = config_factory({
        'admin': {'basename': 'abcdefg'},
        'model model': {
            'model': 'mk',
            'data': str(data_dir / 'basic.csv')}})
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    assert p.exists()
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    remove(p)
    assert cfg['admin']['basename'] == 'abcdefg'
    assert cfg['model model']['model'] == 'mk'

    fname = tmppath / 'test.xml'
    datafile = tmppath / 'test.csv'
    assert not datafile.exists()
    with fname.open('w', encoding='utf8') as fp:
        fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
<!--%s
%s
[admin]
[model model]
-->
<!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
    res = _extract(fname)
    assert datafile.name in ''.join(res)
def _str_path(path, mkdir=False):
    """Get a file-system path as text_type, suitable for passing into io.open.

    Parameters
    ----------
    path : {text_type, Path}
        A fs path either as Path instance or as text_type.
    mkdir : bool (default=False)
        If True, create the directories within the path.

    Returns
    -------
    path : text_type
        The path as text_type.
    """
    res = Path(path_component(path))
    if mkdir and res.parent and not res.parent.exists():
        res.parent.mkdir(parents=True)
    return res.as_posix()
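# Usage sketch: `_str_path` lets callers pass either str or Path and get back a
# text path, creating missing parent directories when mkdir=True. This assumes
# the module-level `path_component` helper used above is importable alongside it.
import io

with io.open(_str_path('out/tables/data.csv', mkdir=True), 'w', encoding='utf8') as fp:
    fp.write(u'a,b\n')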
def contribution_detail_html(context=None, request=None, **kw):
    if context.id == 's4':
        raise HTTPFound(request.route_url('genealogy'))

    p = Path(wals3.__file__).parent.joinpath(
        'static', 'descriptions', str(context.id), 'body.xhtml')
    c = codecs.open(p.as_posix(), encoding='utf8').read()

    adapter = get_adapter(IRepresentation, Feature(), request, ext='snippet.html')
    for feature in DBSession.query(Feature)\
            .filter(Feature.contribution_pk == context.pk)\
            .options(joinedload_all(Feature.domain, DomainElement.values)):
        table = soup(adapter.render(feature, request))
        values = '\n'.join(
            '%s' % table.find(tag).extract() for tag in ['thead', 'tbody'])
        c = c.replace('__values_%s__' % feature.id, values)
    return {'text': c.replace('http://wals.info', request.application_url)}
def rewrite(fname, visitor, **kw):
    """Utility function to rewrite rows in tsv files.

    :param fname: Path of the dsv file to operate on.
    :param visitor: A callable that takes a line-number and a row as input and returns a \
    (modified) row or None to filter out the row.
    :param kw: Keyword parameters are passed through to csv.reader/csv.writer.
    """
    if not isinstance(fname, Path):
        assert isinstance(fname, string_types)
        fname = Path(fname)
    assert fname.is_file()
    tmp = fname.parent.joinpath('.tmp.' + fname.name)

    with UnicodeReader(fname, **kw) as reader_:
        with UnicodeWriter(tmp, **kw) as writer:
            for i, row in enumerate(reader_):
                row = visitor(i, row)
                if row is not None:
                    writer.writerow(row)
    shutil.move(tmp.as_posix(), fname.as_posix())
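# A typical visitor for `rewrite` above: keep the header row, drop rows without
# an ID, and normalize one column in place. The file name and the assumption
# that UnicodeReader/UnicodeWriter accept a `delimiter` keyword (csv-style, as
# in clldutils.dsv) are illustrative.
def lowercase_forms(lineno, row):
    if lineno == 0:
        return row              # keep the header
    if not row or not row[0]:
        return None             # filter out rows without an ID
    row[1] = row[1].strip().lower()
    return row


rewrite('forms.tsv', lowercase_forms, delimiter='\t')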
def from_ini(cls, ini, nodes=None):
    nodes = nodes or {}
    ini = Path(ini)
    directory = ini.parent
    cfg = INI(interpolation=None)
    cfg.read(ini.as_posix(), encoding='utf8')

    lineage = []
    for parent in directory.parents:
        id_ = parent.name
        assert id_ != directory.name
        if not Glottocode.pattern.match(id_):
            # we ignore leading non-languoid-dir path components.
            break

        if id_ not in nodes:
            l = Languoid.from_dir(parent, nodes=nodes)
            nodes[id_] = (l.name, l.id, l.level)
        lineage.append(nodes[id_])

    res = cls(cfg, list(reversed(lineage)), directory=directory)
    nodes[res.id] = (res.name, res.id, res.level)
    return res
def from_ini(cls, ini, nodes={}):
    if not isinstance(ini, Path):
        ini = Path(ini)
    directory = ini.parent
    cfg = INI()
    cfg.read(ini.as_posix(), encoding='utf8')

    lineage = []
    for parent in directory.parents:
        id_ = parent.name.split('.')[-1]
        assert id_ != directory.name.split('.')[-1]
        if not cls.id_pattern.match(id_):
            # we ignore leading non-languoid-dir path components.
            break

        if id_ not in nodes:
            l = Languoid.from_dir(parent, nodes=nodes)
            nodes[id_] = (l.name, l.id, l.level)
        lineage.append(nodes[id_])

    res = cls(cfg, list(reversed(lineage)))
    nodes[res.id] = (res.name, res.id, res.level)
    return res
def from_ini(cls, ini, nodes=None):
    if nodes is None:
        nodes = {}
    ini = Path(ini)
    directory = ini.parent
    cfg = INI(interpolation=None)
    cfg.read(ini.as_posix(), encoding='utf8')

    lineage = []
    for parent in directory.parents:
        id_ = parent.name
        assert id_ != directory.name
        if not Glottocode.pattern.match(id_):
            # we ignore leading non-languoid-dir path components.
            break

        if id_ not in nodes:
            l = Languoid.from_dir(parent, nodes=nodes)
            nodes[id_] = (l.name, l.id, l.level)
        lineage.append(nodes[id_])

    res = cls(cfg, list(reversed(lineage)), directory=directory)
    nodes[res.id] = (res.name, res.id, res.level)
    return res
class Database(object):
    def __init__(self, fname):
        """
        A `Database` instance is initialized with a file path.

        :param fname: Path to a file in the file system where the db is to be stored.
        """
        self.fname = Path(fname)

    def drop(self):
        if self.fname.exists():
            remove(self.fname)

    def connection(self):
        return closing(sqlite3.connect(self.fname.as_posix()))

    def create(self, force=False, exists_ok=False):
        """
        Creates a db file with the core schema.

        :param force: If `True` an existing db file will be overwritten.
        """
        if self.fname and self.fname.exists():
            if force:
                self.drop()
            elif exists_ok:
                return
            else:
                raise ValueError('db file already exists, use force=True to overwrite')
        with self.connection() as db:
            db.execute("""\
CREATE TABLE dataset (
    ID TEXT PRIMARY KEY NOT NULL,
    name TEXT,
    version TEXT,
    metadata_json TEXT
)""")
            db.execute("""\
CREATE TABLE datasetmeta (
    dataset_ID TEXT,
    key TEXT,
    value TEXT,
    PRIMARY KEY (dataset_ID, key),
    FOREIGN KEY(dataset_ID) REFERENCES dataset(ID)
)""")
            db.execute("""\
CREATE TABLE SourceTable (
    dataset_ID TEXT,
    ID TEXT,
    bibtex_type TEXT,
    {0}
    extra TEXT,
    PRIMARY KEY (dataset_ID, ID),
    FOREIGN KEY(dataset_ID) REFERENCES dataset(ID)
)""".format('\n    '.join('`{0}` TEXT,'.format(f) for f in BIBTEX_FIELDS)))

    def fetchone(self, sql, params=None, conn=None, verbose=False):
        return self._fetch(sql, 'fetchone', params, conn, verbose=verbose)

    def fetchall(self, sql, params=None, conn=None, verbose=False):
        return self._fetch(sql, 'fetchall', params, conn, verbose=verbose)

    def _fetch(self, sql, method, params, conn, verbose=False):
        sql = self.sql.get(sql, sql)

        def _do(conn, sql, method):
            cu = conn.cursor()
            if verbose:
                print(sql)
            cu.execute(sql, params or ())
            return getattr(cu, method)()

        if not conn:
            with self.connection() as conn:
                return _do(conn, sql, method)
        else:
            return _do(conn, sql, method)

    @property
    def tables(self):
        res = {r[0]: {} for r in self.fetchall(
            "SELECT name FROM sqlite_master WHERE type='table'")}
        for t in res:
            res[t] = {r[1]: r[2] for r in self.fetchall(
                "PRAGMA table_info({0})".format(t))}
        return res

    def unload(self, dataset_id):
        dataset_id = getattr(dataset_id, 'id', dataset_id)
        with self.connection() as db:
            for table in self.tables:
                if table != 'dataset':
                    db.execute(
                        "DELETE FROM {0} WHERE dataset_ID = ?".format(table),
                        (dataset_id,))
            db.execute("DELETE FROM dataset WHERE ID = ?", (dataset_id,))
            db.commit()

    def _create_table_if_not_exists(self, table):
        if table.name in self.tables:
            return False

        with self.connection() as conn:
            conn.execute(table.sql)
        return True

    def load(self, ds, verbose=False):
        """
        Load a CLDF dataset into the database.

        :param dataset:
        :return:
        """
        try:
            self.fetchone('select ID from dataset')
        except sqlite3.OperationalError:
            self.create(force=True)
        self.unload(ds)
        dataset = ds.cldf.wl
        tables, ref_tables = schema(dataset)

        # update the DB schema:
        for t in tables:
            if self._create_table_if_not_exists(t):
                continue
            db_cols = self.tables[t.name]
            for col in t.columns:
                if col.name not in db_cols:
                    with self.connection() as conn:
                        conn.execute(
                            "ALTER TABLE {0} ADD COLUMN `{1.name}` {1.db_type}".format(
                                t.name, col))
                else:
                    if db_cols[col.name] != col.db_type:
                        raise ValueError(
                            'column {0}:{1} {2} redefined with new type {3}'.format(
                                t.name, col.name, db_cols[col.name], col.db_type))

        for t in ref_tables.values():
            self._create_table_if_not_exists(t)

        self.update_schema()

        # then load the data:
        with self.connection() as db:
            db.execute('PRAGMA foreign_keys = ON;')
            insert(
                db,
                'dataset',
                'ID,name,version,metadata_json',
                (
                    ds.id,
                    '{0}'.format(dataset),
                    git_hash(ds.dir),
                    dumps(dataset.metadata_dict)))
            insert(
                db,
                'datasetmeta',
                'dataset_ID,key,value',
                *[(ds.id, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

            # load sources:
            rows = []
            for src in dataset.sources.items():
                values = [ds.id, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
                values.append(
                    dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
                rows.append(tuple(values))
            insert(
                db,
                'SourceTable',
                ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
                *rows)

            # For regular tables, we extract and keep references to sources.
            refs = defaultdict(list)

            for t in tables:
                # We want to lookup columns by the name used in the CLDF dataset.
                cols = {col.cldf_name: col for col in t.columns}
                # But we also want to look up primary keys by the database column name.
                cols_by_name = {col.name: col for col in t.columns}
                ref_table = ref_tables.get(t.name)
                rows, keys = [], []
                try:
                    for row in dataset[t.name]:
                        keys, values = ['dataset_ID'], [ds.id]
                        for k, v in row.items():
                            if ref_table and k == ref_table.consumes:
                                col = cols_by_name[t.primary_key]
                                refs[ref_table.name].append((row[col.cldf_name], v))
                            else:
                                col = cols[k]
                                if isinstance(v, list):
                                    v = (col.separator or ';').join(
                                        nfilter(col.convert(vv) for vv in v))
                                else:
                                    v = col.convert(v)
                                keys.append("`{0}`".format(col.name))
                                values.append(v)
                        keys, values = self.update_row(t.name, keys, values)
                        rows.append(tuple(values))
                    insert(db, t.name, keys, *rows, **{'verbose': verbose})
                except FileNotFoundError:
                    if t.name != 'CognateTable':  # An empty CognateTable is allowed.
                        raise  # pragma: no cover

            # Now insert the references, i.e. the associations with sources:
            for tname, items in refs.items():
                rows = []
                for oid, sources in items:
                    for source in sources:
                        sid, context = Sources.parse(source)
                        rows.append([ds.id, oid, sid, context])
                oid_col = '{0}_ID'.format(tname.replace('Source', ''))
                insert(db, tname, ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)
            db.commit()

    def update_schema(self):
        for tname, cname, type_ in [
            ('ParameterTable', 'Ontological_Category', 'TEXT'),
            ('ParameterTable', 'Semantic_Field', 'TEXT'),
            ('LanguageTable', 'Latitude', 'REAL'),
            ('LanguageTable', 'Longitude', 'REAL'),
        ]:
            if cname not in self.tables[tname]:
                with self.connection() as conn:
                    conn.execute("ALTER TABLE {0} ADD COLUMN `{1}` {2}".format(
                        tname, cname, type_))

    def update_row(self, table, keys, values):
        return keys, values

    def load_concepticon_data(self, concepticon):
        conceptsets = []
        for csid in self.fetchall("SELECT distinct concepticon_id FROM parametertable"):
            cs = concepticon.conceptsets.get(csid[0])
            if cs:
                conceptsets.append((
                    cs.gloss, cs.ontological_category, cs.semanticfield, cs.id))
        with self.connection() as db:
            db.executemany(
                """\
UPDATE parametertable
SET concepticon_gloss = ?, ontological_category = ?, semantic_field = ?
WHERE concepticon_id = ?""",
                conceptsets)
            db.commit()

    def load_glottolog_data(self, glottolog):
        langs = []
        languoids = {l.id: l for l in glottolog.languoids()}
        for gc in self.fetchall("SELECT distinct glottocode FROM languagetable"):
            lang = languoids.get(gc[0])
            if lang:
                langs.append((
                    lang.lineage[0][0] if lang.lineage else lang.name,
                    lang.macroareas[0].name if lang.macroareas else None,
                    lang.latitude,
                    lang.longitude,
                    lang.id))
        with self.connection() as db:
            db.executemany(
                """\
UPDATE languagetable
SET family = ?, macroarea = ?, latitude = ?, longitude = ?
WHERE glottocode = ?""",
                langs)
            db.commit()

    sql = {
        "conceptsets_by_dataset":
            "SELECT ds.id, count(distinct p.concepticon_id) "
            "FROM dataset as ds, parametertable as p "
            "WHERE ds.id = p.dataset_id GROUP BY ds.id",
        "families_by_dataset":
            "SELECT ds.id, count(distinct l.family) "
            "FROM dataset as ds, languagetable as l "
            "WHERE ds.id = l.dataset_id GROUP BY ds.id",
        "macroareas_by_dataset":
            "SELECT ds.id, group_concat(distinct l.macroarea) "
            "FROM dataset as ds, languagetable as l "
            "WHERE ds.id = l.dataset_id GROUP BY ds.id",
        "glottocodes_by_dataset":
            "SELECT ds.id, count(distinct l.glottocode) "
            "FROM dataset as ds, languagetable as l "
            "WHERE ds.id = l.dataset_id GROUP BY ds.id",
        "mapped_lexemes_by_dataset":
            "SELECT ds.id, count(distinct f.ID) "
            "FROM dataset as ds, formtable as f, languagetable as l, parametertable as p "
            "WHERE ds.id = f.dataset_id and f.Language_ID = l.ID and "
            "f.Parameter_ID = p.ID and l.glottocode is not null and "
            "p.concepticon_id is not null "
            "GROUP BY ds.id",
        "lexemes_by_dataset":
            "SELECT ds.id, count(f.ID) FROM dataset as ds, formtable as f "
            "WHERE ds.id = f.dataset_id GROUP BY ds.id",
    }
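# Minimal usage sketch for the Database class above. `ds` stands for a
# lexibank-style dataset object exposing `.id`, `.dir` and `.cldf.wl`, and
# `glottolog`/`concepticon` for the respective API objects; all three are
# assumptions for illustration, not defined here.
db = Database('lexibank.sqlite')
db.load(ds, verbose=False)                # (re)load one dataset
db.load_glottolog_data(glottolog)         # add family/macroarea/coordinates
db.load_concepticon_data(concepticon)     # add concepticon glosses
print(db.fetchall('lexemes_by_dataset'))  # named query from the `sql` mapping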
def includeme(config):
    """Upgrading:

    - register utilities "by hand", after config.include('clld.web.app')
    - add routes by hand (and remove these from the **kw passed to Configurator)

    :param config:
    :return:
    """
    #
    # now we exploit the default package layout as created via the CLLD scaffold:
    #
    # note: the following exploits the import time side effect of modifying the webassets
    # environment!
    root_package = config.root_package.__name__
    maybe_import('%s.assets' % root_package)

    pkg_dir = Path(config.root_package.__file__).parent.resolve()

    json_renderer = JSON()
    json_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    json_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('json', json_renderer)

    jsonp_renderer = JSONP(param_name='callback')
    jsonp_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    jsonp_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('jsonp', jsonp_renderer)

    config.set_request_factory(ClldRequest)
    config.registry.registerUtility(CtxFactoryQuery(), interfaces.ICtxFactoryQuery)
    config.registry.registerUtility(OlacConfig(), interfaces.IOlacConfig)

    # initialize the db connection
    engine = engine_from_config(config.registry.settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.bind = engine

    config.add_settings({
        'pyramid.default_locale_name': 'en',
        'clld.pkg': root_package,
        'clld.parameters': {}})
    if 'clld.files' in config.registry.settings:
        # deployment-specific location of static data files
        abspath = Path(config.registry.settings['clld.files']).resolve()
        config.add_settings({'clld.files': abspath})
        config.add_static_view('files', abspath.as_posix())

    # event subscribers:
    config.add_subscriber(add_localizer, events.NewRequest)
    config.add_subscriber(init_map, events.ContextFound)
    config.add_subscriber(
        partial(add_renderer_globals, maybe_import('%s.util' % root_package)),
        events.BeforeRender)

    #
    # make it easy to register custom functionality
    #
    for name, func in {
        'register_datatable': partial(register_cls, interfaces.IDataTable),
        'register_map': partial(register_cls, interfaces.IMap),
        'register_menu': register_menu,
        'register_resource': register_resource,
        'register_adapter': register_adapter,
        'register_adapters': register_adapters,
        'register_download': register_download,
        'register_staticresource': register_staticresource,
        'add_route_and_view': add_route_and_view,
        'add_settings_from_file': add_settings_from_file,
        'add_301': add_301,
        'add_410': add_410,
        'add_page': add_page,
        'register_resource_routes_and_views': register_resource_routes_and_views,
    }.items():
        config.add_directive(name, func)

    #
    # routes and views
    #
    config.add_static_view('clld-static', 'clld:web/static')
    config.add_static_view('static', '%s:static' % root_package)

    config.add_route_and_view('_js', '/_js', js, http_cache=3600)

    # add some maintenance hatches
    config.add_route_and_view('_raise', '/_raise', _raise)
    config.add_route_and_view('_ping', '/_ping', _ping, renderer='json')

    # sitemap support:
    config.add_route_and_view('robots', '/robots.txt', robots)
    config.add_route_and_view('sitemapindex', '/sitemap.xml', sitemapindex)
    config.add_route_and_view('sitemap', '/sitemap.{rsc}.{n}.xml', sitemap)
    config.add_route('resourcemap', '/resourcemap.json')
    config.add_view(resourcemap, route_name='resourcemap', renderer='jsonp')
    config.add_route_and_view(
        'select_combination', '/_select_combination', select_combination)

    config.add_route_and_view('unapi', '/unapi', unapi)
    config.add_route_and_view('olac', '/olac', olac)

    config.add_settings_from_file(pkg_dir.joinpath('appconf.ini'))
    if not config.registry.settings.get('mako.directories'):
        config.add_settings({'mako.directories': ['clld:web/templates']})

    for rsc in RESOURCES:
        config.register_resource_routes_and_views(rsc)
        config.register_datatable(
            rsc.plural, getattr(datatables, rsc.plural.capitalize(), DataTable))
        register_resource_adapters(config, rsc)

    # maps
    config.register_map('languages', Map)
    config.register_map('language', LanguageMap)
    config.register_map('parameter', ParameterMap)
    config.register_map('combination', CombinationMap)

    config.include('clld.web.adapters')

    for icon in ICONS:
        config.registry.registerUtility(icon, interfaces.IIcon, name=icon.name)
    config.registry.registerUtility(ORDERED_ICONS, interfaces.IIconList)
    config.registry.registerUtility(MapMarker(), interfaces.IMapMarker)

    #
    # inspect default locations for views and templates:
    #
    home_comp = OrderedDict()
    for name, template in [
        ('introduction', False),
        ('about', False),
        ('terms', False),
        ('glossary', False),
        ('history', False),
        ('changes', False),
        ('credits', False),
        ('legal', True),
        ('download', True),
        ('contact', True),
        ('help', False),
    ]:
        home_comp[name] = template

    if pkg_dir.joinpath('templates').exists():
        for p in pkg_dir.joinpath('templates').iterdir():
            if p.stem in home_comp and p.suffix == '.mako':
                home_comp[p.stem] = True

    for name, template in home_comp.items():
        if template:
            config.add_page(name)

    config.add_settings({'home_comp': [k for k in home_comp.keys() if home_comp[k]]})

    if 'clld.favicon' not in config.registry.settings:
        favicon = {'clld.favicon': 'clld:web/static/images/favicon.ico'}
        # hard to test (in particular on travis) and without too much consequence
        # (and the consequences faced are easy to spot).
        if pkg_dir.joinpath('static', 'favicon.ico').exists():  # pragma: no cover
            favicon['clld.favicon'] = root_package + ':static/favicon.ico'
        config.add_settings(favicon)

    with open(abspath_from_asset_spec(
            config.registry.settings['clld.favicon']), mode='rb') as fp:
        fh = md5()
        fh.update(fp.read())
        config.add_settings({'clld.favicon_hash': fh.hexdigest()})

    translation_dirs = ['clld:locale']
    if pkg_dir.joinpath('locale').exists():
        translation_dirs.append('%s:locale' % root_package)  # pragma: no cover
    config.add_translation_dirs(*translation_dirs)

    if pkg_dir.joinpath('static/publisher_logo.png').exists():  # pragma: no cover
        config.add_settings(
            {'clld.publisher_logo': '%s:static/publisher_logo.png' % root_package})

    if asbool(config.registry.settings.get('clld.pacific_centered_maps')):
        geojson.pacific_centered()

    v = maybe_import('%s.views' % root_package)
    if v:
        config.scan(v)  # pragma: no cover

    menuitems = config.registry.settings.get(
        'clld.menuitems_list',
        ['contributions', 'parameters', 'languages', 'contributors'])
    config.register_menu(('dataset', dict(label='Home')), *menuitems)

    config.include('pyramid_mako')

    for name in ['adapters', 'datatables', 'maps']:
        mod = maybe_import('%s.%s' % (root_package, name))
        if mod and hasattr(mod, 'includeme'):
            config.include(mod)

    config.register_download(CldfDownload(common.Dataset, root_package))
def includeme(config):
    """Upgrading:

    - register utilities "by hand", after config.include('clld.web.app')
    - add routes by hand (and remove these from the **kw passed to Configurator)

    :param config:
    :return:
    """
    #
    # now we exploit the default package layout as created via the CLLD scaffold:
    #
    # note: the following exploits the import time side effect of modifying the webassets
    # environment!
    root_package = config.root_package.__name__
    maybe_import('%s.assets' % root_package)

    pkg_dir = Path(config.root_package.__file__).parent.resolve()

    json_renderer = JSON()
    json_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    json_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('json', json_renderer)

    jsonp_renderer = JSONP(param_name='callback')
    jsonp_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    jsonp_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('jsonp', jsonp_renderer)

    config.set_request_factory(ClldRequest)
    config.registry.registerUtility(CtxFactoryQuery(), interfaces.ICtxFactoryQuery)
    config.registry.registerUtility(OlacConfig(), interfaces.IOlacConfig)

    # initialize the db connection
    engine = engine_from_config(config.registry.settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.bind = engine

    config.add_settings({
        'pyramid.default_locale_name': 'en',
        'clld.pkg': root_package,
        'clld.parameters': {}})
    if 'clld.files' in config.registry.settings:
        # deployment-specific location of static data files
        abspath = Path(config.registry.settings['clld.files']).resolve()
        config.add_settings({'clld.files': abspath})
        config.add_static_view('files', abspath.as_posix())

    # event subscribers:
    config.add_subscriber(add_localizer, events.NewRequest)
    config.add_subscriber(init_map, events.ContextFound)
    config.add_subscriber(
        partial(add_renderer_globals, maybe_import('%s.util' % root_package)),
        events.BeforeRender)

    #
    # make it easy to register custom functionality
    #
    for name, func in {
        'register_datatable': partial(register_cls, interfaces.IDataTable),
        'register_map': partial(register_cls, interfaces.IMap),
        'register_menu': register_menu,
        'register_resource': register_resource,
        'register_adapter': register_adapter,
        'register_adapters': register_adapters,
        'register_download': register_download,
        'register_staticresource': register_staticresource,
        'add_route_and_view': add_route_and_view,
        'add_settings_from_file': add_settings_from_file,
        'add_301': add_301,
        'add_410': add_410,
        'register_resource_routes_and_views': register_resource_routes_and_views,
    }.items():
        config.add_directive(name, func)

    #
    # routes and views
    #
    config.add_static_view('clld-static', 'clld:web/static')
    config.add_static_view('static', '%s:static' % root_package)

    config.add_route_and_view('_js', '/_js', js, http_cache=3600)

    # add some maintenance hatches
    config.add_route_and_view('_raise', '/_raise', _raise)
    config.add_route_and_view('_ping', '/_ping', _ping, renderer='json')

    # sitemap support:
    config.add_route_and_view('robots', '/robots.txt', robots)
    config.add_route_and_view('sitemapindex', '/sitemap.xml', sitemapindex)
    config.add_route_and_view('sitemap', '/sitemap.{rsc}.{n}.xml', sitemap)
    config.add_route('resourcemap', '/resourcemap.json')
    config.add_view(resourcemap, route_name='resourcemap', renderer='jsonp')
    config.add_route_and_view(
        'select_combination', '/_select_combination', select_combination)

    config.add_route_and_view('unapi', '/unapi', unapi)
    config.add_route_and_view('olac', '/olac', olac)

    config.add_settings_from_file(pkg_dir.joinpath('appconf.ini'))
    if not config.registry.settings.get('mako.directories'):
        config.add_settings({'mako.directories': ['clld:web/templates']})

    for rsc in RESOURCES:
        config.register_resource_routes_and_views(rsc)
        config.register_datatable(
            rsc.plural, getattr(datatables, rsc.plural.capitalize(), DataTable))
        register_resource_adapters(config, rsc)

    # maps
    config.register_map('languages', Map)
    config.register_map('language', LanguageMap)
    config.register_map('parameter', ParameterMap)
    config.register_map('combination', CombinationMap)

    config.include('clld.web.adapters')

    for icon in ICONS:
        config.registry.registerUtility(icon, interfaces.IIcon, name=icon.name)
    config.registry.registerUtility(ORDERED_ICONS, interfaces.IIconList)
    config.registry.registerUtility(MapMarker(), interfaces.IMapMarker)

    #
    # inspect default locations for views and templates:
    #
    home_comp = OrderedDict()
    for name, template in [
        ('introduction', False),
        ('about', False),
        ('terms', False),
        ('glossary', False),
        ('history', False),
        ('changes', False),
        ('credits', False),
        ('legal', True),
        ('download', True),
        ('contact', True),
        ('help', False),
    ]:
        home_comp[name] = template

    if pkg_dir.joinpath('templates').exists():
        for p in pkg_dir.joinpath('templates').iterdir():
            if p.stem in home_comp and p.suffix == '.mako':
                home_comp[p.stem] = True

    views = maybe_import('%s.views' % root_package)
    for name, template in home_comp.items():
        if template:
            config.add_route_and_view(
                name,
                '/' + name,
                getattr(views, name, lambda r: {}),
                renderer=name + '.mako')

    config.add_settings({'home_comp': [k for k in home_comp.keys() if home_comp[k]]})

    if 'clld.favicon' not in config.registry.settings:
        favicon = {'clld.favicon': 'clld:web/static/images/favicon.ico'}
        # hard to test (in particular on travis) and without too much consequence
        # (and the consequences faced are easy to spot).
        if pkg_dir.joinpath('static', 'favicon.ico').exists():  # pragma: no cover
            favicon['clld.favicon'] = root_package + ':static/favicon.ico'
        config.add_settings(favicon)

    with open(abspath_from_asset_spec(
            config.registry.settings['clld.favicon']), mode='rb') as fp:
        fh = md5()
        fh.update(fp.read())
        config.add_settings({'clld.favicon_hash': fh.hexdigest()})

    translation_dirs = ['clld:locale']
    if pkg_dir.joinpath('locale').exists():
        translation_dirs.append('%s:locale' % root_package)  # pragma: no cover
    config.add_translation_dirs(*translation_dirs)

    if pkg_dir.joinpath('static/publisher_logo.png').exists():  # pragma: no cover
        config.add_settings(
            {'clld.publisher_logo': '%s:static/publisher_logo.png' % root_package})

    if asbool(config.registry.settings.get('clld.pacific_centered_maps')):
        geojson.pacific_centered()

    v = maybe_import('%s.views' % root_package)
    if v:
        config.scan(v)  # pragma: no cover

    menuitems = config.registry.settings.get(
        'clld.menuitems_list',
        ['contributions', 'parameters', 'languages', 'contributors'])
    config.register_menu(('dataset', dict(label='Home')), *menuitems)

    config.include('pyramid_mako')

    for name in ['adapters', 'datatables', 'maps']:
        mod = maybe_import('%s.%s' % (root_package, name))
        if mod and hasattr(mod, 'includeme'):
            config.include(mod)

    config.register_download(CldfDownload(common.Dataset, root_package))
class Database(object):
    def __init__(self, fname):
        """
        A `Database` instance is initialized with a file path.

        :param fname: Path to a file in the file system where the db is to be stored.
        """
        self.fname = Path(fname)

    def drop(self):
        if self.fname.exists():
            remove(self.fname)

    def connection(self):
        return closing(sqlite3.connect(self.fname.as_posix()))

    def create(self, force=False):
        """
        Creates a db file with the core schema.

        :param force: If `True` an existing db file will be overwritten.
        """
        if self.fname and self.fname.exists():
            if force:
                self.drop()
            else:
                raise ValueError(
                    'db file already exists, use force=True to overwrite')
        with self.connection() as db:
            db.execute("""\
CREATE TABLE dataset (
    ID INTEGER PRIMARY KEY NOT NULL,
    name TEXT,
    module TEXT,
    metadata_json TEXT
)""")
            db.execute("""\
CREATE TABLE datasetmeta (
    dataset_ID INT,
    key TEXT,
    value TEXT,
    FOREIGN KEY(dataset_ID) REFERENCES dataset(ID)
)""")
            db.execute("""\
CREATE TABLE SourceTable (
    dataset_ID INT,
    ID TEXT PRIMARY KEY NOT NULL,
    bibtex_type TEXT,
    {0}
    extra TEXT,
    FOREIGN KEY(dataset_ID) REFERENCES dataset(ID)
)""".format('\n    '.join('"{0}" TEXT,'.format(f) for f in BIBTEX_FIELDS)))

    def fetchone(self, sql, conn=None):
        return self._fetch(sql, 'fetchone', conn)

    def fetchall(self, sql, conn=None):
        return self._fetch(sql, 'fetchall', conn)

    def _fetch(self, sql, method, conn):
        def _do(conn, sql, method):
            cu = conn.cursor()
            cu.execute(sql)
            return getattr(cu, method)()

        if not conn:
            with self.connection() as conn:
                return _do(conn, sql, method)
        else:
            return _do(conn, sql, method)

    def delete(self, dataset_id):
        with self.connection() as db:
            for row in db.execute(
                    "SELECT name FROM sqlite_master WHERE type='table'"):
                table = row[0]
                if table != 'dataset':
                    db.execute(
                        "DELETE FROM {0} WHERE dataset_ID = ?".format(table),
                        (dataset_id,))
            db.execute("DELETE FROM dataset WHERE ID = ?", (dataset_id,))
            db.commit()

    def _create_table_if_not_exists(self, table):
        if table.name in [
            r[0] for r in self.fetchall(
                "SELECT name FROM sqlite_master WHERE type='table'")
        ]:
            return False

        with self.connection() as conn:
            conn.execute(table.sql)
        return True

    def load(self, dataset):
        """
        Load a CLDF dataset into the database.

        :param dataset:
        :return:
        """
        tables, ref_tables = schema(dataset)

        # update the DB schema:
        for t in tables:
            if self._create_table_if_not_exists(t):
                continue
            db_cols = {
                r[1]: r[2]
                for r in self.fetchall("PRAGMA table_info({0})".format(t.name))}
            for col in t.columns:
                if col.name not in db_cols:
                    with self.connection() as conn:
                        conn.execute(
                            "ALTER TABLE {0} ADD COLUMN \"{1.name}\" {1.db_type}".format(
                                t.name, col))
                else:
                    if db_cols[col.name] != col.db_type:
                        raise ValueError(
                            'column {0}:{1} {2} redefined with new type {3}'.format(
                                t.name, col.name, db_cols[col.name], col.db_type))

        for t in ref_tables.values():
            self._create_table_if_not_exists(t)

        # then load the data:
        with self.connection() as db:
            db.execute('PRAGMA foreign_keys = ON;')
            pk = max(
                [r[0] for r in self.fetchall("SELECT ID FROM dataset", conn=db)] or [0]
            ) + 1
            insert(
                db,
                'dataset',
                'ID,name,module,metadata_json',
                (pk, '{0}'.format(dataset), dataset.module, dumps(dataset.metadata_dict)))
            insert(
                db,
                'datasetmeta',
                'dataset_ID,key,value',
                *[(pk, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

            # load sources:
            rows = []
            for src in dataset.sources.items():
                values = [pk, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
                values.append(
                    dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
                rows.append(tuple(values))
            insert(
                db,
                'SourceTable',
                ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
                *rows)

            # For regular tables, we extract and keep references to sources.
            refs = defaultdict(list)

            for t in tables:
                cols = {col.name: col for col in t.columns}
                ref_table = ref_tables.get(t.name)
                rows, keys = [], []
                for row in dataset[t.name]:
                    keys, values = ['dataset_ID'], [pk]
                    for k, v in row.items():
                        if ref_table and k == ref_table.consumes:
                            refs[ref_table.name].append((row[t.primary_key], v))
                        else:
                            col = cols[k]
                            if isinstance(v, list):
                                v = (col.separator or ';').join(
                                    col.convert(vv) for vv in v)
                            else:
                                v = col.convert(v)
                            keys.append(k)
                            values.append(v)
                    rows.append(tuple(values))
                insert(db, t.name, keys, *rows)

            # Now insert the references, i.e. the associations with sources:
            for tname, items in refs.items():
                rows = []
                for oid, sources in items:
                    for source in sources:
                        sid, context = Sources.parse(source)
                        rows.append([pk, oid, sid, context])
                oid_col = '{0}_ID'.format(tname.replace('Source', ''))
                insert(db, tname, ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)
            db.commit()
def create(self, req, filename=None, verbose=True):
    p = self.abspath(req)
    if not p.parent.exists():  # pragma: no cover
        p.parent.mkdir()
    tmp = Path('%s.tmp' % p)

    language_url_pattern = self.route_url_pattern(req, 'language')

    with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
        tables = []
        for param in DBSession.query(Parameter).options(joinedload(Parameter.domain)):
            fname = '%s-%s.csv' % (req.dataset.id, param.id)
            zipfile.writestr(fname, self.get_values(param, language_url_pattern))
            tables.append({
                '@type': 'Table',
                'url': fname,
                'notes': [
                    {
                        '@id': req.resource_url(param),
                        'dc:identifier': param.id,
                        'dc:title': param.name,
                        'dc:description': param.description or ''}] + [
                    {
                        '@type': 'DomainElement',
                        'name': de.name,
                        'description': de.description,
                        'numeric': de.number,
                    } for de in param.domain
                ],
            })

        md = CsvmJsonAdapter.csvm_basic_doc(req, tables=tables)
        md.update({
            '@type': 'TableGroup',
            'dc:language': list(self.get_languages(req, language_url_pattern)),
            'tableSchema': {
                "columns": [
                    {"name": "ID", "datatype": "string", "required": True},
                    {"name": "Language_ID", "datatype": "string", "required": True},
                    {"name": "Parameter_ID", "datatype": "string", "required": True},
                    {"name": "Contribution_ID", "datatype": "string", "required": True},
                    {"name": "Value", "datatype": "string", "required": True},
                    {"name": "Source", "datatype": "string"},
                    {"name": "Comment", "datatype": "string"},
                ],
                "primaryKey": "ID",
                'aboutUrl': self.route_url_pattern(req, 'value', '{ID}'),
            },
        })
        zipfile.writestr(
            '%s.csv-metadata.json' % req.dataset.id, json.dumps(md, indent=4))

        bib = Database([
            rec.bibtex() for rec in DBSession.query(Source).order_by(Source.name)])
        zipfile.writestr('%s.bib' % req.dataset.id, ('%s' % bib).encode('utf8'))

        zipfile.writestr(
            'README.txt',
            README.format(
                req.dataset.name,
                '=' * (len(req.dataset.name) + len(' data download')),
                req.dataset.license,
                TxtCitation(None).render(req.dataset, req)).encode('utf8'))
    if p.exists():  # pragma: no cover
        remove(p)
    move(tmp, p)