def run(args):
    """CLI entry point: report on the datasets indexed in ``args.repos``.

    Depending on flags, prints per-dataset citations, a summary table of
    datasets, or a (markdown) index of all files — optionally augmented
    with sample/reference counts.
    """
    if args.citations:  # pragma: no cover
        # Print one citation blurb per dataset, then stop.
        for dataset in args.repos.index:
            print('> {}\n'.format(dataset.citation))
        return
    if args.datasets_only:
        # One row per dataset (name, file count, total size), plus a totals row.
        with Table(args, 'dataset', 'files', 'size') as t:
            totalfiles, totalsize = 0, 0
            for dataset in args.repos.index:
                totalfiles += len(dataset.files)
                totalsize += sum(f.size for f in dataset.files)
                t.append([
                    dataset.name,
                    len(dataset.files),
                    format_size(sum(f.size for f in dataset.files))])
            t.append([
                'total: {} datasets'.format(len(args.repos.index)),
                totalfiles,
                format_size(totalsize)])
        return
    if args.index:  # pragma: no cover
        # Emit a markdown preamble; 'pipe' makes Table render a markdown table.
        args.format = 'pipe'
        print("""# Content [georoc.sqlite.gz](georoc.sqlite.gz) contains data from [GEOROC's precompiled datasets](https://data.goettingen-research-online.de/dataverse/digis) as listed below. """)
    # Default mode: one row per file, optionally filtered by --dataset substring.
    with Table(args, 'file', 'dataset', 'size', 'last modified') as t:
        if args.samples:
            t.columns.append('# samples')
        if args.references:
            t.columns.append('# references')
        t.columns.append('path')
        for ds in args.repos.index:
            if not args.dataset or (args.dataset in ds.name):
                for f in ds.files:
                    row = [
                        '[{}]({})'.format(f.id, f.md['pidURL']),
                        ds.name,
                        format_size(f.size),
                        f.date]
                    if args.samples:
                        # Counting requires materializing the sample iterator.
                        row.append(len(list(f.iter_samples(args.repos, stdout=None))))
                    if args.references:
                        row.append(len(list(f.iter_references(args.repos))))
                    row.append(f.name)
                    t.append(row)
def upload_sources(args):
    """
    concepticon upload_sources path/to/cdstar/catalog

    Upload PDF sources to the CDSTAR instance and write a table-of-contents
    README next to the sources.

    CDSTAR authorisation is read from the environment variables
    CDSTAR_URL, CDSTAR_USER and CDSTAR_PWD.
    """
    # Fall back to the CDSTAR_CATALOG environment variable when no path is
    # passed (consistent with the other upload_sources variants in this file);
    # previously a missing argument raised an unhelpful IndexError.
    catalog_path = args.args[0] if args.args else os.environ['CDSTAR_CATALOG']
    toc = ['# Sources\n']
    api = Concepticon(args.data)
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        with Catalog(
                catalog_path,
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for fname in sorted(api.data_path('sources').glob('*.pdf'), key=lambda f: f.stem):
                clid = as_unicode(fname.stem)
                if not lcat.get(clid):
                    # cat.create yields (oid, metadata, obj) triples; only the
                    # object of the first (and only) triple is needed. The
                    # return value of lcat.add was never used, so it is not bound.
                    _, _, obj = list(cat.create(fname, {'collection': 'concepticon'}))[0]
                    lcat.add(clid, obj)
        # Build the markdown TOC from the (now complete) local catalog.
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append('- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url']))
    readme(api.data_path('sources'), toc)
def linked_image(obj, check=True):
    """Return an HTML anchor wrapping a web-sized thumbnail of *obj*.

    The anchor links to the full bitstream; its title shows the stored size.
    When *check* is true, a non-image object raises ``ValueError``.
    """
    if check and maintype(obj) != 'image':
        raise ValueError('type mismatch: {0} and image'.format(maintype(obj)))
    thumbnail = HTML.img(src=bitstream_url(obj, 'web'), class_='image')
    size_label = format_size(obj.jsondata.get('size', 0))
    return HTML.a(
        thumbnail,
        href=bitstream_url(obj),
        title="View image ({0})".format(size_label))
def format_file(f, with_mime_type=True):
    """Render a badge for file *f*: a type-appropriate icon plus a size label.

    When *with_mime_type* is true, the MIME type is prepended to the label.
    """
    main_type = f.mime_type.split('/')[0]
    icon_name = {
        'image': 'camera',
        'video': 'facetime-video',
    }.get(main_type, 'file')
    prefix = f.mime_type + '; ' if with_mime_type else ''
    label = ' [%s%s]' % (prefix, misc.format_size(f.jsondata['size']))
    return HTML.span(icon(icon_name, inverted=True), label, class_='badge')
def file_link(file):
    """Render an HTML link to *file*, labelled with its format and size.

    Falls back to a handle.net URL derived from the file id when no explicit
    ``url`` is stored in the JSON metadata. When license metadata exists, a
    "licensed under" note with an external link is appended.
    """
    url = file.jsondata.get('url', 'http://hdl.handle.net/' + file.id.replace('__', '/'))
    suffix = pathlib.Path(url.split('/')[-1]).suffix
    content = [
        HTML.a(
            icon('file'),
            '{} ({})'.format(
                suffix[1:].upper() if suffix else 'PDF',
                format_size(file.jsondata['size'])),
            href=url,
        )]
    # Use .get() so files without license metadata don't raise KeyError
    # (mirrors the .get() used for 'url' above); also rename the local to
    # avoid shadowing the `license` builtin.
    license_spec = file.jsondata.get('license')
    if license_spec:
        content.append(' licensed under ')
        content.append(external_link(
            license_spec['url'],
            label=license_spec['id'].upper(),
            title=license_spec['name']))
    return HTML.span(*content)
def upload_sources(args):
    """
    Compile sources and upload the result to GWDG CDSTAR instance.

    Notes
    -----
    CDSTAR authorisation information should be supplied in the form of
    environment variables:
    - CDSTAR_URL
    - CDSTAR_USER
    - CDSTAR_PWD

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    if args.args:
        catalog_path = args.args[0]
    else:
        catalog_path = os.environ["CDSTAR_CATALOG"]
    toc = ["# Sources\n"]
    api = Concepticon(args.repos)
    with SourcesCatalog(api.data_path("sources", "cdstar.json")) as lcat:
        credentials = dict(
            cdstar_url=os.environ["CDSTAR_URL"],
            cdstar_user=os.environ["CDSTAR_USER"],
            cdstar_pwd=os.environ["CDSTAR_PWD"],
        )
        with Catalog(catalog_path, **credentials) as cat:
            pdfs = sorted(api.data_path("sources").glob("*.pdf"), key=lambda f: f.stem)
            for fname in pdfs:
                clid = as_unicode(fname.stem)
                if not lcat.get(clid):
                    # Only the object of the single created triple is needed.
                    created = list(cat.create(fname, {"collection": "concepticon"}))
                    lcat.add(clid, created[0][2])
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append("- [{0} [PDF {1}]]({2})".format(
                key, format_size(spec["size"]), spec["url"]))
    readme(api.data_path("sources"), toc)
    print(catalog_path)
def link(obj, label=None, with_mime_type=True, badge=False):
    """Render an HTML anchor for *obj* with an icon and size/MIME-type info.

    The icon is chosen by MIME type (falling back on the main type, then a
    generic download icon). Size and type details are appended to *label*
    in parentheses when available.
    """
    if not label:
        label = 'View file'
    mtype = mimetype(obj)
    icon_name = MIMETYPE_TO_ICON.get(
        mtype, MIMETYPE_TO_ICON.get(maintype(obj, mimetype_=mtype), 'download-alt'))
    # Collect the parenthesized details: size (when known), then MIME type.
    details = []
    if obj.jsondata.get('size'):
        details.append(format_size(obj.jsondata['size']))
    if with_mime_type:
        details.append(mtype)
    md = ', '.join(details)
    if md:
        label += ' (%s)' % md
    css_class = 'badge' if badge else 'cdstar_link'
    return HTML.a(
        HTML.span(icon(icon_name, inverted=badge), ' ' + label, class_=css_class),
        href=bitstream_url(obj))
def upload_sources(args):
    """
    Compile sources and upload the result to GWDG CDSTAR instance.

    Notes
    -----
    CDSTAR authorisation information should be supplied in the form of
    environment variables:
    - CDSTAR_URL
    - CDSTAR_USER
    - CDSTAR_PWD

    Examples
    --------
    $ concepticon upload_sources path/to/cdstar/catalog
    """
    catalog_path = args.args[0] if args.args else os.environ['CDSTAR_CATALOG']
    api = Concepticon(args.repos)
    toc = ['# Sources\n']
    with SourcesCatalog(api.data_path('sources', 'cdstar.json')) as lcat:
        cdstar_kw = {
            'cdstar_url': os.environ['CDSTAR_URL'],
            'cdstar_user': os.environ['CDSTAR_USER'],
            'cdstar_pwd': os.environ['CDSTAR_PWD'],
        }
        with Catalog(catalog_path, **cdstar_kw) as cat:
            sources = sorted(api.data_path('sources').glob('*.pdf'), key=lambda f: f.stem)
            for fname in sources:
                clid = as_unicode(fname.stem)
                spec = lcat.get(clid)
                if not spec:
                    _, _, obj = list(cat.create(fname, {'collection': 'concepticon'}))[0]
                    spec = lcat.add(clid, obj)
        for key in sorted(lcat.items):
            spec = lcat.get(key)
            toc.append('- [{0} [PDF {1}]]({2})'.format(
                key, format_size(spec['size']), spec['url']))
    readme(api.data_path('sources'), toc)
    print(catalog_path)
def size_h(self):
    """Human-readable rendering of this object's size (0 when unset)."""
    size_in_bytes = getattr(self, 'size', 0)
    return format_size(size_in_bytes)
def size(self, req):
    """Human-readable size of the file at this object's path, or None if absent."""
    path = self.abspath(req)
    if not path.exists():
        return None
    return format_size(path.stat().st_size)
def size_h(self):
    """Human-readable rendering of ``self.size``."""
    raw_size = self.size
    return format_size(raw_size)
def run(args):
    """Download, package and (optionally) publish a dataset's audio files.

    Modes (selected via args flags):
    - ``--list``: tally and print number/size of media items per MIME type.
    - default: download audio files into per-parameter folders.
    - ``--create-release``: zip the audio folder and write zenodo.json + README.
    - ``--update-zenodo``: push the recorded metadata to an existing deposit.
    """
    ds = Dataset().cldf_reader()
    release_dir = args.out / '{0}_audio'.format(Dataset().id)
    zenodo_file_name = 'zenodo.json'
    if args.list:
        # Tallies keyed by MIME type.
        size = collections.Counter()
        number = collections.Counter()
    else:
        # Map form IDs to parameter IDs so files can be grouped per parameter.
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio = args.out / 'audio'
        audio.mkdir(exist_ok=True)
    if not args.update_zenodo:
        for row in tqdm.tqdm([r for r in ds['media.csv']]):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                # Media IDs double as MD5 checksums: only (re)download when
                # the file is missing or its checksum doesn't match.
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(args.mimetype):
                        create_download_thread(url, target)
    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))
    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(audio.resolve())
        release_dir.mkdir(exist_ok=True)
        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            zipf = zipfile.ZipFile(
                str(release_dir / 'audio.zip'), 'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for root, dirs, files in tqdm.tqdm(os.walk(audio)):
                for f in files:
                    # Skip hidden/system files and (optionally) non-matching types.
                    if not f.startswith('.') and not f.startswith('__')\
                            and ((args.mimetype is None) or f.endswith(args.mimetype)):
                        zipf.write(
                            os.path.join(root, f),
                            os.path.relpath(os.path.join(root, f), fp))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            # Keep only the Zenodo-relevant keys of a contributor record.
            return {
                k: v for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        with jsonlib.update(
                release_dir / zenodo_file_name,
                indent=4,
                default=collections.OrderedDict()) as md:
            contribs = Dataset().dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [
                    {'id': community_id} for community_id in COMMUNITIES]
            md.update({
                'title': '{0} Audio Files'.format(Dataset().metadata.title),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                # NOTE(review): 'video' looks odd for audio files — presumably a
                # Zenodo upload-type limitation; confirm before changing.
                'upload_type': 'video',
                'version': VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme': 'url',
                        'identifier': '{0}{1}/tree/v{2}'.format(
                            GITHUB_PREFIX, Dataset().id, VERSION),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if Dataset().metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': Dataset().metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            md['description'] = html.escape(
                DESCRIPTION.format(
                    GITHUB_PREFIX,
                    Dataset().id,
                    Dataset().metadata.url if Dataset().metadata.url else '',
                    VERSION))
            license_md = ''
            if Dataset().metadata.zenodo_license:
                md['license'] = {'id': Dataset().metadata.zenodo_license}
                license_md = LISENCE.format(Dataset().metadata.zenodo_license)
            DataDir(release_dir).write(
                'README.md',
                RELEASE_NOTE.format(
                    md['title'],
                    GITHUB_PREFIX,
                    Dataset().id,
                    Dataset().metadata.title,
                    license_md))
    if args.update_zenodo:
        # Require a prior --create-release run.
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()
        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))
        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')
        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(
            zenodo_url, args.update_zenodo))
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        # Interactive confirmation before touching the deposit.
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
def run(args):
    """Download, package and (optionally) publish a dataset's media files.

    Modes (mutually constrained, selected via args flags):
    - ``--list``: tally and print number/size of media items per type.
    - default: download media files into two-character ID-prefix folders.
    - ``--create-release``: zip the media folder and write zenodo.json + README
      (requires ``--parent-doi``).
    - ``--update-zenodo``: push the recorded metadata to an existing deposit.
    """
    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    # --- argument validation -------------------------------------------------
    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid passed DOI')
        raise ParserError
    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error(
                'You cannot create the release and update zenodo at the same time.')
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error(
                'The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    # Optional comma-separated filter on file extensions / MIME-type prefixes.
    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        # Tallies keyed by "mimetype (extension)".
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    # --- download / list phase ----------------------------------------------
    if not args.update_zenodo:
        used_file_extensions = set()
        with UnicodeWriter(media_dir / INDEX_CSV if not args.list else None) as w:
            for i, row in enumerate(
                    tqdm.tqdm(
                        [r for r in ds_cldf['media.csv']],
                        desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                # Debug mode caps the number of processed rows.
                if args.debug and i > 500:
                    break
                if (mime_types is None) or f_ext in mime_types\
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        # Shard into folders by the first two ID characters.
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if i == 0:
                            # First iteration: write the header row.
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
                        # Media IDs double as MD5 checksums: only (re)download
                        # when missing or checksum mismatch.
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    # --- release creation ----------------------------------------------------
    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())
        release_dir.mkdir(exist_ok=True)
        media.append(media_dir / INDEX_CSV)
        try:
            zipf = zipfile.ZipFile(
                str(release_dir / '{0}.zip'.format(MEDIA)), 'w',
                zipfile.ZIP_DEFLATED)
            fp = args.out
            for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            # Keep only the Zenodo-relevant keys of a contributor record.
            return {
                k: v for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        # Version and repository URL are derived from the local git checkout.
        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')
        with jsonlib.update(
                release_dir / ZENODO_FILE_NAME,
                indent=4,
                default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            # Merge communities from existing metadata, CLI and module constant.
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [
                    {"identifier": community_id}
                    for community_id in sorted(set(communities))]
            md.update({
                'title': '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'dataset',
                'publication_date': datetime.today().strftime('%Y-%m-%d'),
                'version': version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier': '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'
                })
                supplement_to = " - Supplement to dataset " \
                    "<a href='https://doi.org/{0}'>{1}</a> ".format(
                        args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': ds.metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description \
                if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: " \
                    "<a href='{0}'>{0}</a>".format(online_url)
            md['description'] = html.escape(
                DESCRIPTION.format(
                    url=online_url,
                    formats=' ({0})'.format(formats) if formats else '',
                    title=md['title'],
                    supplement_to=supplement_to,
                    descr=descr,
                    online=online))
            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)
            DataDir(release_dir).write(
                'README.md',
                README.format(
                    title=md['title'],
                    doi='https://doi.org/{0}'.format(args.parent_doi),
                    ds_title=ds.metadata.title,
                    license=license_md,
                    formats=' ({0})'.format(formats) if formats else '',
                    media=MEDIA,
                    index=INDEX_CSV))

    # --- Zenodo update -------------------------------------------------------
    if args.update_zenodo:
        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))
        # In debug mode, talk to the Zenodo sandbox instead of production.
        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')
        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(
                zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warn(
                'Passed deposit ID does not refer to latest version {0}!'.format(
                    latest_version))
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Version: ' + rec.metadata.version)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        # Interactive confirmation before touching the deposit.
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
def test_format_size():
    from clldutils.misc import format_size

    # format_size must yield a truthy (non-empty) result across magnitudes.
    for exponent in range(10):
        assert format_size(1000 ** exponent)
def bitstream_link(oid, spec):
    """HTML link to a bitstream, labelled with its id and human-readable size."""
    bid = spec['bitstreamid']
    url = cdstar.SERVICE_URL.path(
        '/bitstreams/{0}/{1}'.format(oid, bid)).as_string()
    label = '{0} [{1}]'.format(bid, format_size(spec['filesize']))
    return HTML.a(label, href=url)
def bitstream_link(oid, spec):
    """HTML link to a bitstream, labelled with its id and human-readable size."""
    bitstream_id = spec['bitstreamid']
    target = SERVICE_URL.path(
        '{0}/{1}'.format(oid, bitstream_id)).as_string()
    return HTML.a(
        '{0} [{1}]'.format(bitstream_id, format_size(spec['filesize'])),
        href=target)
def test_format_size():
    from clldutils.misc import format_size

    # Every power of 1000 up to 10^27 must format to a truthy string.
    sizes = [1000 ** power for power in range(10)]
    for s in sizes:
        assert format_size(s)
def format_size(f):
    """Human-readable size of file *f*, read from its JSON metadata."""
    size_in_bytes = f.jsondata['size']
    return misc.format_size(size_in_bytes)