def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk), n=10000, verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {'path': as_posix(tmp), 'resources': count_rsc, 'triples': count_triples}
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath(
        'static', 'download', '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))
def freeze_func(args, dataset=None, with_history=True):
    dataset = dataset or args.env['request'].dataset
    dump_dir = args.data_file('dumps')
    if not dump_dir.exists():
        dump_dir.mkdir()
    dump_dir = dump_dir.resolve()

    with dump_dir.joinpath('README.txt').open('w', encoding='utf8') as fp:
        fp.write(freeze_readme(dataset, args.env['request']))

    db_version = get_alembic_version(DBSession)

    for table in Base.metadata.sorted_tables:
        csv = dump_dir.joinpath('%s.csv' % table.name)
        if with_history or not table.name.endswith('_history'):
            _freeze(table, csv)

        if csv.exists():
            csvm = '%s.%s' % (table.name, CsvmJsonAdapter.extension)
            doc = CsvmJsonAdapter.csvm_doc(
                csvm, args.env['request'], [(col.name, col) for col in table.columns])
            if db_version:
                # We (ab)use a dc:identifier property to pass the alembic revision of the
                # database to the unfreeze script.
                doc["dc:identifier"] = db_version  # pragma: no cover
            jsonlib.dump(doc, dump_dir.joinpath(csvm))

    with ZipFile(
            as_posix(args.data_file('..', 'data.zip')), 'w', ZIP_DEFLATED) as zipfile:
        for f in dump_dir.iterdir():
            if f.is_file():
                with f.open('rb') as fp:
                    zipfile.writestr(f.name, fp.read())
def download_and_unpack_zipfiles(url, dataset, *paths):
    """Download zipfiles and immediately unpack the content"""
    with TemporaryDirectory() as tmpdir:
        urlretrieve(url, tmpdir.joinpath('ds.zip').as_posix())
        with zipfile.ZipFile(tmpdir.joinpath('ds.zip').as_posix()) as zipf:
            for path in paths:
                zipf.extract(as_posix(path), path=tmpdir.as_posix())
                copy(tmpdir.joinpath(path), dataset.raw)
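# Hedged usage sketch (an illustration, not part of the original source): the
# helper above expects a URL, a dataset object exposing a `raw` directory, and
# the archive-internal member paths to extract. The URL and member names below
# are hypothetical.
#
#     download_and_unpack_zipfiles(
#         'https://example.org/archive.zip', dataset,
#         'data/values.csv', 'data/languages.csv')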
def iterentries(filename, encoding=None):
    encoding = encoding or 'utf8'
    with memorymapped(as_posix(filename)) as source:
        try:
            for entrytype, (bibkey, fields) in iterentries_from_text(source, encoding):
                yield entrytype, (bibkey, fields)
        except PybtexSyntaxError as e:  # pragma: no cover
            debug_pybtex(source, e)
def rmtree(d, **kw):
    """More performant way to remove large directory structures."""
    d = as_posix(d)
    for path in (os.path.join(d, f) for f in os.listdir(d)):
        if os.path.isdir(path):
            rmtree(path)
        else:
            os.unlink(path)
    os.rmdir(d)
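# Illustrative, hedged usage of the rmtree variant above (assumes the function
# and its module-level imports, i.e. `os` and `as_posix`, are in scope): build
# a small throw-away directory tree and remove it recursively.
import os
import tempfile

scratch = tempfile.mkdtemp()
os.makedirs(os.path.join(scratch, 'a', 'b'))
with open(os.path.join(scratch, 'a', 'b', 'dummy.txt'), 'w') as fp:
    fp.write('temporary content')
rmtree(scratch)
assert not os.path.exists(scratch)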
def load(path, **kw):
    """python 2 + 3 compatible version of json.load.

    :param kw: Keyword parameters are passed to json.load
    :return: The python object read from path.
    """
    _kw = {}
    if PY3:  # pragma: no cover
        _kw['encoding'] = 'utf8'
    with open(as_posix(path), **_kw) as fp:
        return json.load(fp, **kw)
def unfreeze_func(args, engine=None):
    try:
        importlib.import_module(args.module.__name__)
    except ImportError:
        pass  # pragma: no cover
    engine = engine or DBSession.get_bind()
    data_dir = Path(mkdtemp())

    with ZipFile(as_posix(args.module_dir.joinpath('..', 'data.zip'))) as fp:
        fp.extractall(as_posix(data_dir))

    db_version = None
    for table in Base.metadata.sorted_tables:
        csv = data_dir.joinpath('%s.csv' % table.name)
        if csv.exists():
            db_version = load(table, csv, engine)

    if db_version:
        set_alembic_version(engine, db_version)  # pragma: no cover

    rmtree(data_dir)
def dump(obj, path, **kw):
    """python 2 + 3 compatible version of json.dump.

    :param obj: The object to be dumped.
    :param path: The path of the JSON file to be written.
    :param kw: Keyword parameters are passed to json.dump
    """
    _kw = dict(mode='w')
    if PY3:  # pragma: no cover
        _kw['encoding'] = 'utf8'
    with open(as_posix(path), **_kw) as fp:
        return json.dump(obj, fp, **kw)
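# Hedged round-trip sketch for the json helpers in this listing (assumes the
# `dump` above and the `load` defined earlier are in scope together with their
# imports, i.e. `json`, `PY3` and `as_posix`); the file name is hypothetical.
import os
import tempfile

tmp = os.path.join(tempfile.mkdtemp(), 'metadata.json')
dump({'resources': 3, 'triples': 120}, tmp, indent=4)
assert load(tmp)['resources'] == 3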
def write(self, filename, encoding='utf-8'):
    """Write the list of entries to a file.

    :param filename:
    :param encoding:
    :return:
    """
    if isinstance(filename, Path):
        filename = as_posix(filename)
    with io.open(filename, 'w', encoding=encoding) as fp:
        for entry in self:
            fp.write(entry.__unicode__())
            fp.write('\n\n')
def xls2csv(fname, outdir=None):
    res = {}
    outdir = outdir or fname.parent
    wb = xlrd.open_workbook(as_posix(fname))
    for sname in wb.sheet_names():
        sheet = wb.sheet_by_name(sname)
        if sheet.nrows:
            path = outdir.joinpath(
                fname.stem + '.' + slug(sname, lowercase=False) + '.csv')
            with UnicodeWriter(path) as writer:
                for i in range(sheet.nrows):
                    writer.writerow([col.value for col in sheet.row(i)])
            res[sname] = path
    return res
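# Hedged usage sketch (not part of the original source): convert every
# non-empty sheet of a hypothetical workbook into one CSV file per sheet,
# keyed by sheet name. Assumes xls2csv above is in scope and `data.xls`
# exists; a pathlib-style Path is expected because of `fname.stem`/`.parent`.
#
#     from pathlib import Path
#     sheets = xls2csv(Path('data.xls'), outdir=Path('csv'))
#     print(sheets)  # e.g. {'Sheet1': Path('csv/data.Sheet1.csv'), ...}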
def languoids(self, ids=None, maxlevel=models.Level.dialect):
    nodes = {}
    for dirpath, dirnames, filenames in os.walk(as_posix(self.tree)):
        dp = Path(dirpath)
        if dp.name in nodes and nodes[dp.name][2] > maxlevel:
            del dirnames[:]
        for dirname in dirnames:
            if ids is None or dirname in ids:
                lang = languoids.Languoid.from_dir(dp.joinpath(dirname), nodes=nodes)
                if lang.level <= maxlevel:
                    yield lang
def parse(filename, encoding, entry_sep, entry_prefix):
    if isinstance(filename, Path):
        filename = as_posix(filename)

    # we cannot use codecs.open, because it does not understand mode U.
    with open(filename, 'rU', encoding=encoding) as fp:
        content = fp.read()

    for block in content.split(entry_sep):
        if block.strip():
            block = entry_prefix + block
        else:
            continue  # pragma: no cover
        yield [(k, v.strip()) for k, v in marker_split(block.strip()) if v.strip()]
def write(self, filename, encoding='utf8'):
    """
    Write the list of entries to a file.

    :param filename:
    :param encoding:
    :return:
    """
    if isinstance(filename, Path):
        filename = as_posix(filename)
    with open(filename, 'w', encoding=encoding) as fp:
        for entry in self:
            fp.write(entry.__unicode__())
            fp.write('\n\n')
def parse(filename, encoding, entry_sep, entry_prefix, keep_empty=False):
    if isinstance(filename, Path):
        filename = as_posix(filename)

    with io.open(filename, 'r', encoding=encoding, newline=None) as fp:
        content = fp.read()

    for block in content.split(entry_sep):
        if block.strip():
            block = entry_prefix + block
        else:
            continue  # pragma: no cover
        yield [(k, v.strip()) for k, v in marker_split(block.strip())
               if v.strip() or keep_empty]
def parse(filename, encoding, entry_sep, entry_prefix, keep_empty=False):
    if isinstance(filename, Path):
        filename = as_posix(filename)

    # we cannot use codecs.open, because it does not understand mode U.
    with io.open(filename, 'rU', encoding=encoding) as fp:
        content = fp.read()

    for block in content.split(entry_sep):
        if block.strip():
            block = entry_prefix + block
        else:
            continue  # pragma: no cover
        yield [(k, v.strip()) for k, v in marker_split(block.strip())
               if v.strip() or keep_empty]
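# Hedged usage sketch (illustration only): the parse variants above split a
# marker-based (SFM-like) file into entries on `entry_sep`, re-attach
# `entry_prefix` and yield (marker, value) pairs per entry via the module's
# marker_split helper. The file name and markers below are hypothetical.
#
#     for entry in parse('lexicon.sfm', 'utf8', '\\lx ', '\\lx ', keep_empty=False):
#         markers = dict(entry)
#         print(markers.get('lx'), markers.get('ge'))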
def download_and_unpack(self, url, *paths, **kw):
    """
    Download a zipfile and immediately unpack selected content.

    :param url:
    :param paths:
    :param kw:
    :return:
    """
    with self.temp_download(url, 'ds.zip', log=kw.pop('log', None)) as zipp:
        with TemporaryDirectory() as tmpdir:
            with zipfile.ZipFile(zipp.as_posix()) as zipf:
                for path in paths:
                    zipf.extract(as_posix(path), path=tmpdir.as_posix())
                    copy(tmpdir.joinpath(path), self)
def dump(obj, path, **kw):
    """Python 2 + 3 compatible version of json.dump.

    :param obj: The object to be dumped.
    :param path: The path of the JSON file to be written.
    :param kw: Keyword parameters are passed to json.dump
    """
    open_kw = {'mode': 'w'}
    if PY3:  # pragma: no cover
        open_kw['encoding'] = 'utf-8'
    # avoid indented lines ending with ", " on PY2
    if kw.get('indent') and kw.get('separators') is None:
        kw['separators'] = (',', ': ')
    with open(as_posix(path), **open_kw) as fp:
        return json.dump(obj, fp, **kw)
def read_comments(filename):
    parser = CommentParser.get_parser()
    with open(as_posix(filename), "rb") as fp:
        parser.feed(fp.read())
    return [e for e in parser.close() if e.tag == ET.Comment]
def save(entries, filename, sortkey, encoding='utf8'):
    with io.open(as_posix(filename), 'w', encoding=encoding, errors='strict') as fd:
        dump(entries, fd, sortkey, encoding, None)
def __init__(self, fname, mode='r'):
    ZipFile.__init__(
        self, as_posix(fname), mode=mode, compression=ZIP_DEFLATED, allowZip64=True)
def __init__(self, fname, mode='r', **kwargs):
    for k, v in iteritems(self._init_defaults):
        kwargs.setdefault(k, v)
    super(ZipArchive, self).__init__(as_posix(fname), mode=mode, **kwargs)
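# Hedged sketch (an assumption, not the original class definition): an
# __init__ like the one above typically lives in a zipfile.ZipFile subclass
# that injects class-level defaults and accepts pathlib-style paths.
import zipfile


class ExampleArchive(zipfile.ZipFile):
    _init_defaults = {
        'compression': zipfile.ZIP_DEFLATED,
        'allowZip64': True,
    }

    def __init__(self, fname, mode='r', **kwargs):
        # fall back to the class-level defaults for anything not passed in
        for k, v in self._init_defaults.items():
            kwargs.setdefault(k, v)
        super(ExampleArchive, self).__init__(str(fname), mode=mode, **kwargs)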
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(
            joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (
                    source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (
                    item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)
def save(entries, filename, sortkey, encoding='utf-8', normalize='NFC'):
    with io.open(as_posix(filename), 'w', encoding=encoding, errors='strict') as fd:
        dump(entries, fd, sortkey, normalize)
def test_as_posix():
    from clldutils.path import as_posix, Path

    with pytest.raises(ValueError):
        as_posix(5)
    assert as_posix('.') == as_posix(Path('.'))
def check(filename, encoding=None):
    parser = CheckParser(encoding=encoding)
    parser.parse_file(as_posix(filename))
    return parser.error_count
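# Hedged usage sketch (not part of the original source): count syntax problems
# in a hypothetical BibTeX file with the check function above, assuming it and
# its CheckParser are in scope.
#
#     errors = check('references.bib', encoding='utf8')
#     if errors:
#         print('%d problem(s) found' % errors)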
def test_as_posix(self):
    from clldutils.path import as_posix, Path

    self.assertRaises(ValueError, as_posix, 5)
    self.assertEquals(as_posix('.'), as_posix(Path('.')))