def test_ImageProvider_retrieve(self):
    """Exercise ImageProvider.retrieve: no-op, duplicate detection, upload."""
    from pytsammalex.image_providers.base import ImageProvider

    repos = create_repos(self.tmp_path())
    source_file = self.tmp_path('test')
    media = self.tmp_path('media.json')
    with source_file.open('w', encoding='utf8') as fp:
        fp.write('test')

    class P(ImageProvider):
        # Minimal provider stub serving canned metadata.
        def __init__(self, md):
            self._md = md
            ImageProvider.__init__(self, repos)

        def metadata(self, item):
            return self._md

    # Without a 'source_url' in the metadata, retrieve is a no-op.
    self.assertIsNone(P({}).retrieve(None, None, None, None))

    staged_image = Staged_images.fromdict({'id': 'abc', 'taxa__id': 'abc'})

    # A checksum that is already known must be rejected as a duplicate.
    with self.assertRaises(ValueError):
        P({'source_url': source_file}).retrieve(
            staged_image, None, [md5(source_file)], None)

    cdstar = MagicMock(
        create=MagicMock(return_value=[(None, None, MOCK_CDSTAR_OBJECT)]))
    provider = P({'source_url': 'x'})
    with patch('pytsammalex.util.requests', MockRequests()):
        with MediaCatalog(media.name, repos=repos) as mcat:
            provider.retrieve(staged_image, cdstar, [], mcat)
    self.assertTrue(cdstar.create.called)
    # The uploaded object must have been persisted in the media catalog.
    self.assertEqual(len(MediaCatalog(media.name, repos=repos)), 1)
def _download_sql_dump(rel, log):
    """Fetch, verify and unpack the gzipped SQL dump for a Glottolog release.

    The archive named after ``rel['version']`` is downloaded from
    ``rel['sql_dump_url']``, its md5 checksum is compared against
    ``rel['sql_dump_md5']``, and the uncompressed ``.sql`` file is written
    next to it; the archive itself is removed afterwards.
    """
    archive = Path('glottolog-{0}.sql.gz'.format(rel['version']))
    log.info('retrieving {0}'.format(rel['sql_dump_url']))
    urlretrieve(rel['sql_dump_url'], archive.as_posix())
    # Guard against a corrupted or partial download before unpacking.
    assert md5(archive) == rel['sql_dump_md5']
    dump = archive.with_suffix('')  # strips the trailing '.gz'
    with gzip.open(archive.as_posix()) as src, dump.open('wb') as dst:
        shutil.copyfileobj(src, dst)
    archive.unlink()
    log.info('SQL dump for Glottolog release {0} written to {1}'.format(
        rel['version'], dump))
def download_sql_dump(self, log):
    """Download this release's gzipped SQL dump, verify it and unpack it.

    The uncompressed ``.sql`` file ends up next to the archive; the archive
    is deleted once unpacking succeeded.
    """
    archive = self.dump_fname(zipped=True)
    log.info('retrieving {0}'.format(self.sql_dump_url))
    urlretrieve(self.sql_dump_url, str(archive))
    # Abort if the download does not match the published checksum.
    assert md5(archive) == self.sql_dump_md5
    plain = archive.with_suffix('')  # drop the '.gz' suffix
    with gzip.open(str(archive)) as src, plain.open('wb') as dst:
        shutil.copyfileobj(src, dst)
    archive.unlink()
    log.info('SQL dump for Glottolog release {0} written to {1}'.format(
        self.version, plain))
def _upload(self, sfn, files):
    """Upload the files belonging to SoundfileName ``sfn`` to CDSTAR.

    For every file whose suffix maps to a known mimetype, a bitstream is
    added to the (retrieved or newly created) CDSTAR object - unless the
    catalog already lists a bitstream with the same suffix and md5 sum.
    If anything was uploaded, the refreshed object is written back to the
    catalog.
    """
    print(sfn)
    # Lookup the SoundfileName in catalog:
    cat_obj = self[sfn] if sfn in self else None
    # Retrieve or create the corresponding CDSTAR object:
    obj = self.api.get_object(cat_obj.id if cat_obj else None)
    print(obj.id)
    md = {'collection': 'soundcomparisons', 'name': sfn, 'type': 'soundfile'}
    changed = False
    if not cat_obj:
        # If the object is already in the catalog, the metadata does not change!
        obj.metadata = md
    for f in files:
        fmt = f.suffix[1:]
        if fmt not in self.mimetypes:
            # Unknown extension - nothing we know how to upload.
            continue
        create = True
        if cat_obj:
            for cat_bitstream in cat_obj.bitstreams:
                if cat_bitstream.id.endswith(f.suffix):
                    # A bitstream for this mimetype already exists!
                    if cat_bitstream.md5 == md5(f):
                        # If the md5 sum is the same, don't bother uploading!
                        create = False
                    else:
                        # Otherwise we have to delete the old bitstream before uploading the
                        # new one.
                        for bs in obj.bitstreams:
                            if bs.id == cat_bitstream.id:
                                bs.delete()
                                break
                    # Only one bitstream per suffix is considered; stop scanning.
                    break
        if create:
            changed = True
            print('uploading {0}'.format(f.name))
            obj.add_bitstream(fname=str(f), name=f.name, mimetype=self.mimetypes[fmt])
            # Brief pause between uploads, presumably to avoid hammering the
            # CDSTAR service - TODO confirm rationale.
            time.sleep(0.1)
        else:
            print('skipping {0}'.format(f.name))
    if changed:
        # Re-read the object so the catalog entry reflects the new bitstreams.
        obj.read()
        self.add(obj, metadata=md, update=True)
def retrieve(self, item, cdstar_catalog, checksums, mediacatalog):
    """Fetch an image, checksum it, upload it to CDSTAR and register it.

    Steps:

    - download (or copy, if the source is a local path)
    - compute checksum
    - upload to CDSTAR
    - add to cdstar.json

    :param item: Staged_images instance describing the image to fetch.
    :param cdstar_catalog: catalog used to create the CDSTAR object.
    :param checksums: collection of already-known md5 checksums.
    :param mediacatalog: MediaCatalog the uploaded object is added to.
    :raises ValueError: if the downloaded file's checksum is already known.
    :return: Images instance, or None if there was nothing to retrieve.
    """
    md = self.metadata(item) or {}
    source_url = md.pop('source_url', None)
    if not source_url:
        # Nothing to fetch for this item.
        return
    # We turn the Staged_images instance into a `dict`, which we will enrich and then
    # turn into an Images instance.
    item = dict(zip(item.fields(), item.csv_row()))
    with TemporaryDirectory() as tmp:
        if isinstance(source_url, Path):
            # Local file: just copy it into the temporary directory.
            fname = tmp.joinpath(source_url.name)
            copy(source_url, fname)
        else:
            # download the thing
            fname = self._download(source_url, tmp)
            if not fname:
                return
        checksum = md5(fname)
        if checksum in checksums:
            raise ValueError('duplicate item {0} {1}'.format(item['id'], checksum))
        item.update(md)
        # The checksum doubles as the canonical object id.
        item['id'] = checksum
        item['collection'] = 'Tsammalex'
        img = Images.fromdict(item)
        if checksum not in mediacatalog.items:
            # now upload to CDSTAR
            _, _, obj = list(cdstar_catalog.create(fname, item))[0]
            mediacatalog.add(obj)
        return img
def test_image_provider_retrieve(tmpdir):
    """ImageProvider.retrieve: no-op, duplicate rejection and CDSTAR upload."""
    repos = create_repos(tmpdir)
    source = tmpdir.join('test')
    with source.open('w', encoding='utf8') as fp:
        fp.write('test')

    class TestProvider(ImageProvider):
        # Stub provider serving pre-baked metadata.
        def identify(self, name):
            pass

        def __init__(self, md):
            self._md = md
            ImageProvider.__init__(self, repos)

        def metadata(self, item):
            return self._md

    # No 'source_url' in the metadata -> nothing to do.
    assert TestProvider({}).retrieve(None, None, None, None) is None

    staged_image = Staged_images.fromdict({'id': 'abc', 'taxa__id': 'abc'})

    # An already-known checksum must raise.
    with pytest.raises(ValueError):
        TestProvider({'source_url': source}).retrieve(
            staged_image, None, [md5(source)], None)

    cdstar = MagicMock(
        create=MagicMock(return_value=[(None, None, MOCK_CDSTAR_OBJECT)]))
    provider = TestProvider({'source_url': 'x'})
    with patch('pytsammalex.util.requests', MockRequests()):
        with MediaCatalog('media.json', repos=Path(repos)) as catalog:
            provider.retrieve(staged_image, cdstar, [], catalog)
    assert cdstar.create.called is True
    # The uploaded object must have been persisted.
    assert len(MediaCatalog('media.json', repos=Path(repos))) == 1
def test_image_provider_retrieve(tmpdir):
    """Check the three retrieve() code paths: skip, duplicate error, upload."""
    repos = create_repos(tmpdir)
    testfile = tmpdir.join('test')
    with testfile.open('w', encoding='utf8') as handle:
        handle.write('test')

    class TestProvider(ImageProvider):
        def __init__(self, md):
            self._md = md
            ImageProvider.__init__(self, repos)

        def identify(self, name):
            pass

        def metadata(self, item):
            return self._md

    # Empty metadata means retrieve() bails out early.
    assert TestProvider({}).retrieve(None, None, None, None) is None

    staged = Staged_images.fromdict({'id': 'abc', 'taxa__id': 'abc'})
    dup_provider = TestProvider({'source_url': testfile})
    with pytest.raises(ValueError):
        dup_provider.retrieve(staged, None, [md5(testfile)], None)

    mocked_cdstar = MagicMock(
        create=MagicMock(return_value=[(None, None, MOCK_CDSTAR_OBJECT)]))
    upload_provider = TestProvider({'source_url': 'x'})
    with patch('pytsammalex.util.requests', MockRequests()):
        with MediaCatalog('media.json', repos=Path(repos)) as mcat:
            upload_provider.retrieve(staged, mocked_cdstar, [], mcat)
    assert mocked_cdstar.create.called is True
    assert len(MediaCatalog('media.json', repos=Path(repos))) == 1
def includeme(config):
    """Configure a CLLD app on the given Pyramid configurator.

    Registers renderers, utilities, routes/views, static assets, menus and
    downloads, exploiting the default package layout created by the CLLD
    scaffold.

    Upgrading:

    - register utilities "by hand", after config.include('clld.web.app')
    - add routes by hand (and remove these from the **kw passed to Configurator)

    :param config: Pyramid Configurator instance for the app.
    :return: None
    """
    #
    # now we exploit the default package layout as created via the CLLD scaffold:
    #
    # note: the following exploits the import time side effect of modifying the webassets
    # environment!
    root_package = config.root_package.__name__
    pkg_dir = Path(config.root_package.__file__).parent.resolve()
    maybe_import('%s.assets' % root_package, pkg_dir=pkg_dir)

    # JSON/JSONP renderers serializing datetimes/dates as ISO strings:
    json_renderer = JSON()
    json_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    json_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('json', json_renderer)

    jsonp_renderer = JSONP(param_name='callback')
    jsonp_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    jsonp_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('jsonp', jsonp_renderer)

    config.set_request_factory(ClldRequest)
    config.registry.registerUtility(CtxFactoryQuery(), interfaces.ICtxFactoryQuery)
    config.registry.registerUtility(OlacConfig(), interfaces.IOlacConfig)
    config.registry.registerUtility(CldfConfig(), interfaces.ICldfConfig)

    # initialize the db connection
    engine = engine_from_config(config.registry.settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.bind = engine

    try:
        git_tag = git_describe(Path(pkg_dir).parent)
    except ValueError:  # pragma: no cover
        # Not a git checkout (e.g. an installed package) - no tag available.
        git_tag = None

    config.add_settings({
        'pyramid.default_locale_name': 'en',
        'clld.pkg': root_package,
        'clld.git_tag': git_tag,
        'clld.parameters': {}})

    if 'clld.files' in config.registry.settings:
        # deployment-specific location of static data files
        abspath = Path(config.registry.settings['clld.files']).resolve()
        config.add_settings({'clld.files': abspath})
        config.add_static_view('files', str(abspath))

    # event subscribers:
    config.add_subscriber(add_localizer, events.NewRequest)
    config.add_subscriber(init_map, events.ContextFound)
    config.add_subscriber(
        partial(
            add_renderer_globals,
            maybe_import('%s.util' % root_package, pkg_dir=pkg_dir)),
        events.BeforeRender)

    #
    # make it easy to register custom functionality
    #
    for name, func in {
        'register_utility': register_utility,
        'register_datatable': partial(register_cls, interfaces.IDataTable),
        'register_map': partial(register_cls, interfaces.IMap),
        'register_menu': register_menu,
        'register_resource': register_resource,
        'register_adapter': register_adapter,
        'register_adapters': register_adapters,
        'register_download': register_download,
        'register_staticresource': register_staticresource,
        'add_route_and_view': add_route_and_view,
        'add_settings_from_file': add_settings_from_file,
        'add_301': add_301,
        'add_410': add_410,
        'add_page': add_page,
        'register_resource_routes_and_views': register_resource_routes_and_views,
    }.items():
        config.add_directive(name, func)

    #
    # routes and views
    #
    config.add_static_view('clld-static', 'clld:web/static')
    config.add_static_view('static', '%s:static' % root_package)

    config.add_route_and_view('_js', '/_js', js, http_cache=3600)

    # add some maintenance hatches
    config.add_route_and_view('_raise', '/_raise', _raise)
    config.add_route_and_view('_ping', '/_ping', _ping, renderer='json')

    # sitemap support:
    config.add_route_and_view('robots', '/robots.txt', robots)
    config.add_route_and_view('sitemapindex', '/sitemap.xml', sitemapindex)
    config.add_route_and_view('sitemap', '/sitemap.{rsc}.{n}.xml', sitemap)
    config.add_route('resourcemap', '/resourcemap.json')
    config.add_view(resourcemap, route_name='resourcemap', renderer='jsonp')
    config.add_route_and_view('select_combination', '/_select_combination', select_combination)

    config.add_route_and_view('unapi', '/unapi', unapi)
    config.add_route_and_view('olac', '/olac', olac)

    config.add_settings_from_file(pkg_dir.joinpath('appconf.ini'))
    if not config.registry.settings.get('mako.directories'):
        config.add_settings({'mako.directories': ['clld:web/templates']})

    # Register routes, views, datatables and adapters for all core resources:
    for rsc in RESOURCES:
        config.register_resource_routes_and_views(rsc)
        config.register_datatable(
            rsc.plural, getattr(datatables, rsc.plural.capitalize(), DataTable))
        register_resource_adapters(config, rsc)

    # maps
    config.register_map('languages', Map)
    config.register_map('language', LanguageMap)
    config.register_map('parameter', ParameterMap)
    config.register_map('combination', CombinationMap)

    config.include('clld.web.adapters')

    for icon in ICONS:
        config.registry.registerUtility(icon, interfaces.IIcon, name=icon.name)
    config.registry.registerUtility(ORDERED_ICONS, interfaces.IIconList)
    config.registry.registerUtility(MapMarker(), interfaces.IMapMarker)

    #
    # inspect default locations for views and templates:
    #
    home_comp = OrderedDict()
    for name, template in [
        ('introduction', False),
        ('about', False),
        ('terms', False),
        ('glossary', False),
        ('history', False),
        ('changes', False),
        ('credits', False),
        ('legal', True),
        ('download', True),
        ('contact', True),
        ('help', False),
    ]:
        home_comp[name] = template

    # A '<name>.mako' template in the app's templates dir activates that page:
    if pkg_dir.joinpath('templates').exists():
        for p in pkg_dir.joinpath('templates').iterdir():
            if p.stem in home_comp and p.suffix == '.mako':
                home_comp[p.stem] = True

    for name, template in home_comp.items():
        if template:
            config.add_page(name)

    config.add_settings(
        {'home_comp': [k for k in home_comp.keys() if home_comp[k]]})

    if 'clld.favicon' not in config.registry.settings:
        favicon = {'clld.favicon': 'clld:web/static/images/favicon.ico'}
        # hard to test (in particular on travis) and without too much consequence
        # (and the consequences faced are easy to spot).
        if pkg_dir.joinpath('static', 'favicon.ico').exists():  # pragma: no cover
            favicon['clld.favicon'] = root_package + ':static/favicon.ico'
        config.add_settings(favicon)

    config.add_settings({
        'clld.favicon_hash': md5(abspath_from_asset_spec(config.registry.settings['clld.favicon']))
    })

    translation_dirs = ['clld:locale']
    if pkg_dir.joinpath('locale').exists():
        translation_dirs.append('%s:locale' % root_package)  # pragma: no cover
    config.add_translation_dirs(*translation_dirs)

    if pkg_dir.joinpath(
            'static/publisher_logo.png').exists():  # pragma: no cover
        config.add_settings({
            'clld.publisher_logo': '%s:static/publisher_logo.png' % root_package
        })

    if asbool(config.registry.settings.get('clld.pacific_centered_maps')):
        geojson.pacific_centered()

    v = maybe_import('%s.views' % root_package, pkg_dir=pkg_dir)
    if v:
        config.scan(v)  # pragma: no cover

    menuitems = config.registry.settings.get(
        'clld.menuitems_list',
        ['contributions', 'parameters', 'languages', 'contributors'])
    config.register_menu(('dataset', dict(label='Home')), *menuitems)

    config.include('pyramid_mako')

    # Include app-local modules providing their own includeme, if present:
    for name in ['adapters', 'datatables', 'maps']:
        mod = maybe_import('%s.%s' % (root_package, name), pkg_dir=pkg_dir)
        if mod and hasattr(mod, 'includeme'):
            config.include(mod)

    config.register_download(CldfDownload(common.Dataset, root_package))
def downloadSoundFiles(args, out_path=os.path.join(os.getcwd(), "sound"), db_needed=False):
    """
    Downloads desired sound files as {sound/}FilePathPart/FilePathPart_WordID.EXT
    from CDSTAR to {current_folder}/sound or to out_path if passed.
    As default it downloads all stored sound files, with the argument {EXT} you can
    pass desired sound file extensions

    Usage:
        --sc-repo {--db-host --db-name --db-user --db-password} downloadSoundFiles ITEM {EXT}

    Valid ITEMs:
        UID(s):                  EAEA0-3A11-8354-556E-0 EAEA0-303B-3625-4014-0 ...
        Study Name(s):           Brazil Europe ...
        FilePathPart(s):         Clt_Bryth_Wel_Dyfed_Pem_Maenclochog_Dl ...
        FilePathPart(s)+Word:    Clt_Bryth_Wel_Dyfed_Pem_Maenclochog_Dl_909_praised_maalato ...
        FilePathPart(s)+Word.EXT: Clt_Bryth_Wel_Dyfed_Pem_Maenclochog_Dl_909_praised_maalato.mp3 ...
        Language_Index:          11121250509 11131000008 ...

    Valid EXTs: mp3 ogg wav
        (if an extension is not stored it falls back to the first ext mentioned in
        catalog, otherwise no sound file)

    db_needed = False if all items can be calculated as keys of catalog.json like
    FilePathPart {+ WordID}
    """
    if 'db_needed' in args.args:
        db_needed = True
    if db_needed:
        # Study-name and LanguageIx resolution require a database connection.
        db = _db(args)
    catalog = _get_catalog(args, 'soundfiles')
    # holds all desired FilePathParts+WordIDs
    desired_keys = set()
    # get desired extensions
    valid_ext = catalog.mimetypes.keys()
    desired_ext = list(set(args.args) & set(valid_ext))
    if len(desired_ext) == 0:
        # No extension passed -> download all known extensions.
        desired_ext = list(valid_ext)
    else:
        # remove ext from args.args
        args.args = list(set(args.args) - set(valid_ext))
    if db_needed:
        # get desired keys via study names
        try:
            valid_studies = _get_all_study_names(db)
            desired_studies = list(set(args.args) & set(valid_studies))
            if len(desired_studies) > 0:
                # remove study names from args.args
                args.args = list(set(args.args) - set(desired_studies))
                q = " UNION ".join([
                    "SELECT DISTINCT FilePathPart AS f FROM Languages_%s" % (s)
                    for s in desired_studies])
                for x in list(db(q)):
                    new_keys = [
                        SoundfileName(k) for k in catalog.get_soundfilenames(x['f'])]
                    if len(new_keys) == 0:
                        args.log.warning(
                            "Nothing found for %s in catalog - will be ignored" % (
                                x['f']))
                    desired_keys.update(new_keys)
        except ValueError as e:
            args.log.warning(e)
        except Exception as e:
            args.log.error("Check DB settings!")
            args.log.error(e)
            return
        # mapping LanguageIx -> FilePathPart
        if len(args.args) > 0:
            q = " UNION ".join([
                """SELECT DISTINCT FilePathPart AS f, LanguageIx AS i
                   FROM Languages_%s""" % (s) for s in valid_studies])
            try:
                idx_map = {str(x['i']): x['f'] for x in list(db(q))}
            except Exception as e:
                args.log.error("Check DB settings!")
                args.log.error(e)
                return
            # parse LanguageIxs
            for i in args.args:
                # LanguageIx values are long digit strings (11+ digits).
                if re.match(r"^\d{11,}$", i):
                    # remove found LanguageIx from args.args
                    args.args = list(set(args.args) - set([i]))
                    if i in idx_map.keys():
                        # LanguageIx ?
                        new_keys = [
                            SoundfileName(k)
                            for k in catalog.get_soundfilenames(idx_map[i])]
                        if len(new_keys) == 0:
                            args.log.warning(
                                "No sounds for LanguageIx %s (%s) - will be ignored" % (
                                    i, idx_map[i]))
                        desired_keys.update(new_keys)
                    else:
                        args.log.warning("LanguageIx %s unknown - will be ignored" % (i))
    # Remaining args are UIDs, SoundfileNames or FilePathPart prefixes:
    for i in args.args:
        if i in catalog:
            # UID or SoundfileName?
            try:
                # SoundfileName
                desired_keys.add(SoundfileName(i))
            except ValueError:
                # UID
                try:
                    desired_keys.add(SoundfileName(catalog[i].metadata['name']))
                except ValueError:
                    args.log.warning('Path for {0} is not valid - will be skipped'.format(i))
        else:
            desired_keys.update(
                SoundfileName(k) for k in catalog.get_soundfilenames(i))
    args.log.info('{0} sound files selected'.format(len(desired_keys)))
    out_path = Path(out_path)
    if not out_path.exists():
        out_path.mkdir()
    desired_mimetypes = [catalog.mimetypes[ext] for ext in desired_ext]
    # pb = tqdm(total=len(desired_keys))
    # One sub-folder per variety (FilePathPart):
    for folder, sfns in groupby(sorted(desired_keys), lambda s: s.variety):
        args.log.info(' ... {0}'.format(folder))
        folder = out_path / folder
        if not folder.exists():
            try:
                folder.mkdir()
            except Exception as e:
                # Retry once before giving up on this folder.
                try:
                    folder.mkdir()
                except Exception as e:
                    args.log.warning(' ... cannot make folder {0}'.format(folder))
                    continue
        for obj in [catalog[sfn] for sfn in sfns]:
            # pb.update()
            for bs in catalog.matching_bitstreams(obj, mimetypes=desired_mimetypes):
                target = folder / bs.id
                # Skip files that already exist with the right checksum.
                if (not target.exists()) or md5(target) != bs.md5:
                    try:
                        urlretrieve(catalog.bitstream_url(obj, bs), str(target))
                    except Exception as e:
                        # Retry the download once before warning.
                        try:
                            urlretrieve(catalog.bitstream_url(obj, bs), str(target))
                        except Exception as e:
                            args.log.warning(
                                ' ... ... {0} should be checked'.format(obj.metadata['name']))
def run(args):
    """Download a dataset's audio files and optionally create/update a Zenodo release.

    Depending on the flags on ``args``:

    - default: download the media listed in the CLDF ``media.csv`` into
      ``<out>/audio/<parameter_id>/``, one thread per file
    - ``--list``: only tally sizes/counts per mimetype and print them
    - ``--create-release``: zip the audio folder and write zenodo.json plus a
      README into ``<out>/<dataset_id>_audio``
    - ``--update-zenodo``: push the stored zenodo.json metadata to an existing
      Zenodo deposit (asks for confirmation)
    """
    ds = Dataset().cldf_reader()
    release_dir = args.out / '{0}_audio'.format(Dataset().id)
    zenodo_file_name = 'zenodo.json'

    if args.list:
        # Tallies per mimetype: total bytes and number of files.
        size = collections.Counter()
        number = collections.Counter()
    else:
        # Map form IDs to parameter IDs to group audio files by parameter.
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio = args.out / 'audio'
        audio.mkdir(exist_ok=True)

    if not args.update_zenodo:
        for row in tqdm.tqdm([r for r in ds['media.csv']]):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                # The media ID is the file's md5 checksum, so an existing file
                # with a matching checksum need not be fetched again.
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(args.mimetype):
                        create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))

    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(audio.resolve())
        release_dir.mkdir(exist_ok=True)

        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            zipf = zipfile.ZipFile(
                str(release_dir / 'audio.zip'), 'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for root, dirs, files in tqdm.tqdm(os.walk(audio)):
                for f in files:
                    # Skip hidden/system files and non-matching mimetypes.
                    if not f.startswith('.') and not f.startswith('__')\
                            and ((args.mimetype is None) or f.endswith(args.mimetype)):
                        zipf.write(
                            os.path.join(root, f),
                            os.path.relpath(os.path.join(root, f), fp))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            # Keep only the person fields Zenodo understands.
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        with jsonlib.update(
                release_dir / zenodo_file_name,
                indent=4,
                default=collections.OrderedDict()) as md:
            contribs = Dataset().dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [{
                    'id': community_id
                } for community_id in COMMUNITIES]
            md.update({
                'title': '{0} Audio Files'.format(Dataset().metadata.title),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'video',
                'version': VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme': 'url',
                        'identifier': '{0}{1}/tree/v{2}'.format(
                            GITHUB_PREFIX, Dataset().id, VERSION),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if Dataset().metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': Dataset().metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            md['description'] = html.escape(
                DESCRIPTION.format(
                    GITHUB_PREFIX,
                    Dataset().id,
                    Dataset().metadata.url if Dataset().metadata.url else '',
                    VERSION))

            license_md = ''
            if Dataset().metadata.zenodo_license:
                md['license'] = {'id': Dataset().metadata.zenodo_license}
                license_md = LISENCE.format(Dataset().metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                RELEASE_NOTE.format(
                    md['title'], GITHUB_PREFIX, Dataset().id,
                    Dataset().metadata.title, license_md))

    if args.update_zenodo:
        # Both the release dir and its zenodo.json must exist already.
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()

        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))

        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')
        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(
            zenodo_url, args.update_zenodo))
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        # Interactive confirmation before touching the deposit.
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
def includeme(config):
    """Configure a CLLD app on the given Pyramid configurator.

    Sets up renderers, registry utilities, routes/views, static assets,
    menus and downloads based on the default package layout created by the
    CLLD scaffold.

    Upgrading:

    - register utilities "by hand", after config.include('clld.web.app')
    - add routes by hand (and remove these from the **kw passed to Configurator)

    :param config: Pyramid Configurator instance for the app.
    :return: None
    """
    #
    # now we exploit the default package layout as created via the CLLD scaffold:
    #
    # note: the following exploits the import time side effect of modifying the webassets
    # environment!
    root_package = config.root_package.__name__
    pkg_dir = Path(config.root_package.__file__).parent.resolve()
    maybe_import('%s.assets' % root_package, pkg_dir=pkg_dir)

    # JSON/JSONP renderers serializing datetimes/dates as ISO strings:
    json_renderer = JSON()
    json_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    json_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('json', json_renderer)

    jsonp_renderer = JSONP(param_name='callback')
    jsonp_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    jsonp_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('jsonp', jsonp_renderer)

    config.set_request_factory(ClldRequest)
    config.registry.registerUtility(CtxFactoryQuery(), interfaces.ICtxFactoryQuery)
    config.registry.registerUtility(OlacConfig(), interfaces.IOlacConfig)
    config.registry.registerUtility(CldfConfig(), interfaces.ICldfConfig)

    # initialize the db connection
    engine = engine_from_config(config.registry.settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.bind = engine

    try:
        git_tag = git_describe(Path(pkg_dir).parent)
    except ValueError:  # pragma: no cover
        # Not a git checkout (e.g. an installed package) - no tag available.
        git_tag = None

    config.add_settings({
        'pyramid.default_locale_name': 'en',
        'clld.pkg': root_package,
        'clld.git_tag': git_tag,
        'clld.parameters': {}})

    if 'clld.files' in config.registry.settings:
        # deployment-specific location of static data files
        abspath = Path(config.registry.settings['clld.files']).resolve()
        config.add_settings({'clld.files': abspath})
        config.add_static_view('files', str(abspath))

    # event subscribers:
    config.add_subscriber(add_localizer, events.NewRequest)
    config.add_subscriber(init_map, events.ContextFound)
    config.add_subscriber(
        partial(
            add_renderer_globals,
            maybe_import('%s.util' % root_package, pkg_dir=pkg_dir)),
        events.BeforeRender)

    #
    # make it easy to register custom functionality
    #
    for name, func in {
        'register_utility': register_utility,
        'register_datatable': partial(register_cls, interfaces.IDataTable),
        'register_map': partial(register_cls, interfaces.IMap),
        'register_menu': register_menu,
        'register_resource': register_resource,
        'register_adapter': register_adapter,
        'register_adapters': register_adapters,
        'register_download': register_download,
        'register_staticresource': register_staticresource,
        'add_route_and_view': add_route_and_view,
        'add_settings_from_file': add_settings_from_file,
        'add_301': add_301,
        'add_410': add_410,
        'add_page': add_page,
        'register_resource_routes_and_views': register_resource_routes_and_views,
    }.items():
        config.add_directive(name, func)

    #
    # routes and views
    #
    config.add_static_view('clld-static', 'clld:web/static')
    config.add_static_view('static', '%s:static' % root_package)

    config.add_route_and_view('_js', '/_js', js, http_cache=3600)

    # add some maintenance hatches
    config.add_route_and_view('_raise', '/_raise', _raise)
    config.add_route_and_view('_ping', '/_ping', _ping, renderer='json')

    # sitemap support:
    config.add_route_and_view('robots', '/robots.txt', robots)
    config.add_route_and_view('sitemapindex', '/sitemap.xml', sitemapindex)
    config.add_route_and_view('sitemap', '/sitemap.{rsc}.{n}.xml', sitemap)
    config.add_route('resourcemap', '/resourcemap.json')
    config.add_view(resourcemap, route_name='resourcemap', renderer='jsonp')
    config.add_route_and_view(
        'select_combination', '/_select_combination', select_combination)

    config.add_route_and_view('unapi', '/unapi', unapi)
    config.add_route_and_view('olac', '/olac', olac)

    config.add_settings_from_file(pkg_dir.joinpath('appconf.ini'))
    if not config.registry.settings.get('mako.directories'):
        config.add_settings({'mako.directories': ['clld:web/templates']})

    # Register routes, views, datatables and adapters for all core resources:
    for rsc in RESOURCES:
        config.register_resource_routes_and_views(rsc)
        config.register_datatable(
            rsc.plural, getattr(datatables, rsc.plural.capitalize(), DataTable))
        register_resource_adapters(config, rsc)

    # maps
    config.register_map('languages', Map)
    config.register_map('language', LanguageMap)
    config.register_map('parameter', ParameterMap)
    config.register_map('combination', CombinationMap)

    config.include('clld.web.adapters')

    for icon in ICONS:
        config.registry.registerUtility(icon, interfaces.IIcon, name=icon.name)
    config.registry.registerUtility(ORDERED_ICONS, interfaces.IIconList)
    config.registry.registerUtility(MapMarker(), interfaces.IMapMarker)

    #
    # inspect default locations for views and templates:
    #
    home_comp = OrderedDict()
    for name, template in [
        ('introduction', False),
        ('about', False),
        ('terms', False),
        ('glossary', False),
        ('history', False),
        ('changes', False),
        ('credits', False),
        ('legal', True),
        ('download', True),
        ('contact', True),
        ('help', False),
    ]:
        home_comp[name] = template

    # A '<name>.mako' template in the app's templates dir activates that page:
    if pkg_dir.joinpath('templates').exists():
        for p in pkg_dir.joinpath('templates').iterdir():
            if p.stem in home_comp and p.suffix == '.mako':
                home_comp[p.stem] = True

    for name, template in home_comp.items():
        if template:
            config.add_page(name)

    config.add_settings({'home_comp': [k for k in home_comp.keys() if home_comp[k]]})

    if 'clld.favicon' not in config.registry.settings:
        favicon = {'clld.favicon': 'clld:web/static/images/favicon.ico'}
        # hard to test (in particular on travis) and without too much consequence
        # (and the consequences faced are easy to spot).
        if pkg_dir.joinpath('static', 'favicon.ico').exists():  # pragma: no cover
            favicon['clld.favicon'] = root_package + ':static/favicon.ico'
        config.add_settings(favicon)

    config.add_settings({
        'clld.favicon_hash': md5(abspath_from_asset_spec(
            config.registry.settings['clld.favicon']))})

    translation_dirs = ['clld:locale']
    if pkg_dir.joinpath('locale').exists():
        translation_dirs.append('%s:locale' % root_package)  # pragma: no cover
    config.add_translation_dirs(*translation_dirs)

    if pkg_dir.joinpath('static/publisher_logo.png').exists():  # pragma: no cover
        config.add_settings(
            {'clld.publisher_logo': '%s:static/publisher_logo.png' % root_package})

    if asbool(config.registry.settings.get('clld.pacific_centered_maps')):
        geojson.pacific_centered()

    v = maybe_import('%s.views' % root_package, pkg_dir=pkg_dir)
    if v:
        config.scan(v)  # pragma: no cover

    menuitems = config.registry.settings.get(
        'clld.menuitems_list',
        ['contributions', 'parameters', 'languages', 'contributors'])
    config.register_menu(('dataset', dict(label='Home')), *menuitems)

    config.include('pyramid_mako')

    # Include app-local modules providing their own includeme, if present:
    for name in ['adapters', 'datatables', 'maps']:
        mod = maybe_import('%s.%s' % (root_package, name), pkg_dir=pkg_dir)
        if mod and hasattr(mod, 'includeme'):
            config.include(mod)

    config.register_download(CldfDownload(common.Dataset, root_package))
# coding: utf8 from __future__ import unicode_literals, print_function, division from six.moves.urllib.request import urlretrieve from clldutils.jsonlib import load from clldutils.path import Path, md5 import glottolog3 DOWNLOAD_DIR = Path(glottolog3.__file__).parent.joinpath('static', 'download') for rel, spec in load(DOWNLOAD_DIR.parent / 'downloads.json').items(): d = DOWNLOAD_DIR / rel if not d.exists(): d.mkdir() for bs in spec['bitstreams']: url = 'https://cdstar.shh.mpg.de//bitstreams/{0}/{1}'.format( spec['oid'], bs['bitstreamid']) target = d.joinpath(bs['bitstreamid'].replace('_', '-')) if (not target.exists()) or bs['checksum'] != md5(target): print('retrieving {0} {1}'.format(rel, target)) urlretrieve(url, str(target))
def run(args):
    """
    Media-management command for a CLDF dataset. Depending on the flags in
    ``args`` it downloads the dataset's media files, lists them by mimetype,
    packages them into a release zip with Zenodo metadata, or updates an
    existing Zenodo deposit.

    Mutually exclusive/validated modes (enforced below):
      * ``--list``            tally sizes/counts per mimetype, no downloads
      * (default)             download media into ``args.out / MEDIA``
      * ``--create-release``  additionally zip the media and write Zenodo
                              metadata + README (requires ``--parent-doi``)
      * ``--update-zenodo``   push previously created metadata to a deposit
                              (cannot be combined with ``--create-release``)

    Raises ParserError on any invalid flag combination or missing input.
    """
    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    # --- argument validation -------------------------------------------------
    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid passed DOI')
        raise ParserError
    if args.update_zenodo:
        # Updating Zenodo requires the artifacts of a prior --create-release run.
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error(
                'You cannot create the release and update zenodo at the same time.'
            )
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error(
                'The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    # Optional mimetype filter: comma-separated list of extensions or
    # mimetype prefixes.
    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        # Aggregate only; nothing is written to disk in list mode.
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    # --- download / list pass ------------------------------------------------
    if not args.update_zenodo:
        used_file_extensions = set()
        # In list mode the writer target is None — presumably UnicodeWriter
        # then buffers without writing a file (TODO confirm).
        with UnicodeWriter(media_dir / INDEX_CSV if not args.list else None) as w:
            for i, row in enumerate(
                    tqdm.tqdm([r for r in ds_cldf['media.csv']],
                              desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                # Debug runs stop after ~500 rows to keep turnaround short.
                if args.debug and i > 500:
                    break
                # Keep the row if no filter is set, or its extension or
                # mimetype prefix matches the filter.
                if (mime_types is None) or f_ext in mime_types\
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        # Shard files into subdirectories by the first two
                        # characters of the media ID.
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if i == 0:
                            # First iteration: emit the header row
                            # (presumably writerow(dict) writes the keys —
                            # TODO confirm against UnicodeWriter).
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
                        # Media IDs appear to double as the file's md5
                        # checksum; re-download on missing file or mismatch
                        # (NOTE(review): confirm ID==md5 convention).
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    # --- release packaging ---------------------------------------------------
    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())
        release_dir.mkdir(exist_ok=True)
        media.append(media_dir / INDEX_CSV)
        try:
            # Zip all downloaded media plus the index, with paths relative
            # to args.out so the archive unpacks into MEDIA/...
            zipf = zipfile.ZipFile(
                str(release_dir / '{0}.zip'.format(MEDIA)), 'w',
                zipfile.ZIP_DEFLATED)
            fp = args.out
            for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            # Keep only the Zenodo-relevant person fields.
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        # Version info from the git checkout: tag with and without the
        # leading 'v'.
        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')

        # Build/refresh the Zenodo metadata file in place.
        with jsonlib.update(release_dir / ZENODO_FILE_NAME, indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            # Merge pre-existing, CLI-passed and hard-wired communities.
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [{
                    "identifier": community_id
                } for community_id in sorted(set(communities))]
            md.update({
                'title': '{0} {1} Files'.format(ds.metadata.title,
                                                MEDIA.title()),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'dataset',
                'publication_date': datetime.today().strftime('%Y-%m-%d'),
                'version': version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier': '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            # parent_doi is guaranteed here (validated above for
            # --create-release), so supplement_to is always bound before
            # DESCRIPTION.format uses it below.
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'
                })
                supplement_to = " - Supplement to dataset " \
                    "<a href='https://doi.org/{0}'>{1}</a> ".format(
                        args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': ds.metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            formats = ', '.join(sorted(used_file_extensions))
            # Ternary binds to the whole concatenation: either the prefixed
            # description or the empty string.
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(
                    online_url)
            md['description'] = html.escape(
                DESCRIPTION.format(
                    url=online_url,
                    formats=' ({0})'.format(formats) if formats else '',
                    title=md['title'],
                    supplement_to=supplement_to,
                    descr=descr,
                    online=online))
            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)
            DataDir(release_dir).write(
                'README.md',
                README.format(
                    title=md['title'],
                    doi='https://doi.org/{0}'.format(args.parent_doi),
                    ds_title=ds.metadata.title,
                    license=license_md,
                    formats=' ({0})'.format(formats) if formats else '',
                    media=MEDIA,
                    index=INDEX_CSV))

    # --- Zenodo deposit update ----------------------------------------------
    if args.update_zenodo:
        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))
        # Debug mode talks to the Zenodo sandbox instead of production.
        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')
        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(
                zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.
                format(e))
            return
        # Warn (but proceed) when the given deposit is not the latest version.
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warn(
                'Passed deposit ID does not refer to latest version {0}!'.
                format(latest_version))
        args.log.info(' DOI: ' + rec.metadata.doi)
        args.log.info(' Title: ' + rec.metadata.title)
        args.log.info(' Version: ' + rec.metadata.version)
        args.log.info(' Date: ' + rec.metadata.publication_date)
        args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
        # Interactive confirmation before mutating the deposit.
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
def exists(self, repos: 'GEOROC') -> bool:
    """
    Checks whether the specified file exists with correct checksum in the
    repository.
    """
    candidate = repos.csvdir / self.name
    # Missing file short-circuits before the (comparatively expensive) md5.
    if not candidate.exists():
        return False
    return md5(candidate) == self.md5
def run(args):
    """
    Upload the files in ``static/download`` as bitstreams of a CDSTAR object
    for the given Glottolog release, then refresh ``static/downloads.json``
    from the catalog.

    For a known release (bugfix) the existing object is reused and only
    changed bitstreams are replaced (matched by md5 checksum); otherwise a
    fresh object with release metadata is created.
    """
    #
    # FIXME: look up oid for release in downloads.json! if it exists, replace the bitstreams
    # rather than creating a new object!
    #
    dlfname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(dlfname)
    release = args.version
    # Catalog objects for downloads are identified by this title convention.
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with args.catalog_class(args.catalog, args.url, args.user, args.pwd) as cat:
        #
        # FIXME: there must be a way to overwrite old releases in case of bugfixes!
        #
        if release in downloads:
            print('adding bitstreams to {0}'.format(downloads[release]['oid']))
            # This is a bugfix release, we don't have to create a new object on CDSTAR!
            obj = cat.api.get_object(uid=downloads[release]['oid'])
        else:
            obj = cat.api.get_object()
            obj.metadata = {
                "creator": "pycdstar",
                "title": "glottolog %s - downloads" % release,
                "description": "Custom downloads for release %s of "
                               "[Glottolog](http://glottolog.org)" % release,
            }
        # Snapshot the remote bitstream list before we start mutating it.
        bitstreams = obj.bitstreams[:]
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                # CDSTAR bitstream IDs use '_' where filenames use '-'.
                bsname = fname.name.replace('-', '_')
                bitstream, skip = None, False
                # for/else: bitstream stays bound to the match, or is reset
                # to None when no remote bitstream has this name.
                for bitstream in bitstreams:
                    if bitstream.id == bsname:
                        break
                else:
                    bitstream = None
                if bitstream:
                    # NOTE(review): reads a private attribute of the client
                    # library; checksum mismatch means the local file changed,
                    # so the stale remote copy is deleted before re-upload.
                    if bitstream._properties['checksum'] != md5(fname):
                        bitstream.delete()
                    else:
                        skip = True
                        print('skipping {0}'.format(fname.name))
                if not skip:
                    print(fname.name)
                    obj.add_bitstream(fname=fname.as_posix(), name=bsname)
        # Refresh object state from the server before cataloguing it.
        obj.read()
        cat.add(obj, update=True)
    # Rewrite downloads.json from the catalog: keep existing entries, add new
    # versions, and always refresh the entry for the current release.
    with update(dlfname, default=collections.OrderedDict(), indent=4,
                sort_keys=True) as downloads:
        for oid, spec in load(args.catalog).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if (match.group('version') not in downloads
                            ) or match.group('version') == release:
                        args.log.info('update info for release {0}'.format(
                            match.group('version')))
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(dlfname))
    args.log.info('{0}'.format(args.catalog))
def test_md5(self):
    """The md5 helper returns a 32-char lowercase hex digest for a file path."""
    from clldutils.path import md5
    digest = md5(__file__)
    self.assertIsNotNone(re.match('[a-f0-9]{32}$', digest))
def test_md5():
    """The md5 helper returns a 32-char lowercase hex digest for a file path."""
    from clldutils.path import md5
    digest = md5(__file__)
    assert re.match('[a-f0-9]{32}$', digest) is not None
def cdstar(args):
    """
    Upload the files in ``static/download`` as bitstreams of a CDSTAR object
    for the release given as first positional argument, then refresh
    ``static/downloads.json`` from the catalog.

    Connection settings come from the CDSTAR_* environment variables.

    NOTE(review): near-duplicate of the ``run`` command elsewhere in this
    codebase, except config comes from the environment and the object is NOT
    re-read from the server before being catalogued — confirm whether that
    divergence is intentional.
    """
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        # cdstarcat is an optional dependency; point the user at the fix.
        args.log.error('pip install cdstarcat')
        return
    #
    # FIXME: look up oid for release in downloads.json! if it exists, replace the bitstreams
    # rather than creating a new object!
    #
    dlfname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(dlfname)
    release = args.args[0]
    # Catalog objects for downloads are identified by this title convention.
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        #
        # FIXME: there must be a way to overwrite old releases in case of bugfixes!
        #
        if release in downloads:
            # This is a bugfix release, we don't have to create a new object on CDSTAR!
            obj = cat.api.get_object(uid=downloads[release]['oid'])
        else:
            obj = cat.api.get_object()
            obj.metadata = {
                "creator": "pycdstar",
                "title": "glottolog %s - downloads" % release,
                "description": "Custom downloads for release %s of "
                               "[Glottolog](http://glottolog.org)" % release,
            }
        # Snapshot the remote bitstream list before we start mutating it.
        bitstreams = obj.bitstreams[:]
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                # CDSTAR bitstream IDs use '_' where filenames use '-'.
                bsname = fname.name.replace('-', '_')
                bitstream, skip = None, False
                # for/else: bitstream stays bound to the match, or is reset
                # to None when no remote bitstream has this name.
                for bitstream in bitstreams:
                    if bitstream.id == bsname:
                        break
                else:
                    bitstream = None
                if bitstream:
                    # NOTE(review): reads a private attribute of the client
                    # library; checksum mismatch means the local file changed,
                    # so the stale remote copy is deleted before re-upload.
                    if bitstream._properties['checksum'] != md5(fname):
                        bitstream.delete()
                    else:
                        skip = True
                        print('skipping {0}'.format(fname.name))
                if not skip:
                    print(fname.name)
                    obj.add_bitstream(fname=fname.as_posix(), name=bsname)
        cat.add(obj, update=True)
    # Rewrite downloads.json from the catalog: keep existing entries, add new
    # versions, and always refresh the entry for the current release.
    with update(dlfname, default=collections.OrderedDict(), indent=4,
                sort_keys=True) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if (match.group('version') not in downloads) or match.group('version') == release:
                        args.log.info('update info for release {0}'.format(match.group('version')))
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(dlfname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))