def get(self, key_path, filename, progress_cb):
    # Note that we need the path with hash dirs, since we don't have
    # access to annexremote.dirhash from within IO classes
    url = self.base_url + "/annex/objects/" + str(key_path)
    from datalad.support.network import download_url
    download_url(url, filename, overwrite=True)
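
# For context, a minimal hypothetical sketch of the caller side: the
# hash-dir prefix is obtained via the special remote protocol (e.g.
# annex.dirhash(key) in annexremote's Master API, which IO classes cannot
# reach) and prepended before get() is invoked. The key, hash dirs, and
# base URL below are made-up illustration values, not datalad specifics.
key = "MD5E-s1024--0123456789abcdef0123456789abcdef.dat"
hashdir = "f1/2a"  # in the real remote: self.annex.dirhash(key)
key_path = "{}/{}/{}".format(hashdir, key, key)
# io is assumed to be an IO instance carrying base_url, e.g. one set up
# for "https://store.example.com/ds":
# io.get(key_path, "local-copy.dat", progress_cb=None)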
from os.path import join as opj

from datalad.support.exceptions import DownloadError
from datalad.support.network import download_url
from datalad.tests.utils import assert_raises, ok_file_has_content


def test_download_url(toppath, topurl):
    furl = "%sfile.dat" % topurl
    # fails if URL is dysfunctional
    assert_raises(DownloadError, download_url, furl + 'magic', toppath)

    # working download
    tfpath = opj(toppath, "file-downloaded.dat")
    download_url(furl, tfpath)
    ok_file_has_content(tfpath, 'abc')

    # fails if destfile exists
    assert_raises(DownloadError, download_url, furl, tfpath)
    # works when forced
    download_url(furl, tfpath, overwrite=True)
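
# In datalad's test suite, toppath/topurl for a test like this are
# typically injected by decorators that create a temporary file tree and
# serve it over local HTTP. A sketch of that wiring, assuming the
# datalad.tests.utils helpers; the exact tree spec is illustrative, with
# 'abc' matching the content assertion above:
from datalad.tests.utils import serve_path_via_http, with_tree

@with_tree(tree=[('file.dat', 'abc')])
@serve_path_via_http
def test_download_url_wired(toppath, topurl):
    # same body as test_download_url above
    ...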
import logging
from pathlib import Path, PurePosixPath

from datalad.cmd import GitWitlessRunner, StdOutCapture
from datalad.distributed.ora_remote import (
    LocalIO,
    RIARemoteError,
    SSHRemoteIO,
)
from datalad.dochelpers import exc_str
from datalad.support.exceptions import CommandError, DownloadError
from datalad.support.network import URL, download_url
from datalad.utils import make_tempfile

lgr = logging.getLogger('datalad.core.distributed.clone')


def postclonecfg_ria(ds, props):
    """Configure a dataset freshly cloned from a RIA store"""
    repo = ds.repo
    # RIA uses hashdir mixed; copying data to it via git-annex (if cloned
    # via ssh) would make it see a bare repo and establish a hashdir lower
    # annex object tree.
    # Moreover, we want the ORA remote to receive all data for the store,
    # so its objects could be moved into archives (the main point of a RIA
    # store).
    RIA_REMOTE_NAME = 'origin'  # don't hardcode everywhere
    ds.config.set(
        'remote.{}.annex-ignore'.format(RIA_REMOTE_NAME), 'true',
        where='local')

    # chances are that if this dataset came from a RIA store, its
    # subdatasets may live there too. Place a subdataset source candidate
    # config that makes get probe this RIA store when obtaining subdatasets
    ds.config.set(
        # we use the label 'origin' for this candidate in order to not have
        # to generate a complicated name from the actual source
        # specification. we pick a cost of 200 to sort it before datalad's
        # default candidates for non-RIA URLs, because they prioritize
        # hierarchical layouts that cannot be found in a RIA store
        'datalad.get.subdataset-source-candidate-200origin',
        # use the entire original URL, up to the fragment, plus the dataset
        # ID placeholder; this should make things work with any store setup
        # we support (paths, ports, ...)
        props['source'].split('#', maxsplit=1)[0] + '#{id}',
        where='local')

    # setup publication dependency, if a corresponding special remote
    # exists and was enabled (there could be RIA stores that actually only
    # have repos)
    # make this function be a generator
    ora_remotes = [s for s in ds.siblings('query',
                                          result_renderer='disabled')
                   if s.get('annex-externaltype') == 'ora']
    if not ora_remotes and any(
            r.get('externaltype') == 'ora'
            for r in (repo.get_special_remotes().values()
                      if hasattr(repo, 'get_special_remotes')
                      else [])):
        # No ORA remote was autoenabled, but the configuration knows about
        # at least one. Let's check origin's config for
        # datalad.ora-remote.uuid as stored by create-sibling-ria and try
        # enabling that one.
        lgr.debug("Found no autoenabled ORA special remote. Trying to "
                  "look it up in source config ...")

        # First figure whether we cloned via SSH, HTTP or local path and
        # then get that config file the same way:
        config_content = None
        scheme = props['giturl'].split(':', 1)[0]
        if scheme in ['http', 'https']:
            try:
                config_content = download_url(
                    "{}{}config".format(
                        props['giturl'],
                        '/' if not props['giturl'].endswith('/') else ''))
            except DownloadError as e:
                lgr.debug("Failed to get config file from source:\n%s",
                          exc_str(e))
        elif scheme == 'ssh':
            # TODO: switch the following to proper command abstraction:
            # SSHRemoteIO ignores the path part ATM. No remote CWD! (To be
            # changed with command abstractions). So we need to get that
            # part to have a valid path to origin's config file:
            cfg_path = PurePosixPath(URL(props['giturl']).path) / 'config'
            op = SSHRemoteIO(props['giturl'])
            try:
                config_content = op.read_file(cfg_path)
            except RIARemoteError as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        elif scheme == 'file':
            # TODO: switch the following to proper command abstraction:
            op = LocalIO()
            cfg_path = Path(URL(props['giturl']).localpath) / 'config'
            try:
                config_content = op.read_file(cfg_path)
            except (RIARemoteError, OSError) as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        else:
            lgr.debug("Unknown URL scheme %s in %s. Can handle SSH, HTTP "
                      "or FILE scheme URLs.", scheme, props['source'])
        # And read it
        org_uuid = None
        if config_content:
            # TODO: We might be able to spare the saving to a file.
            #       "git config -f -" is not explicitly documented but
            #       happens to work and would read from stdin. Make sure
            #       we know this works for required git versions and on
            #       all platforms.
            with make_tempfile(content=config_content) as cfg_file:
                runner = GitWitlessRunner()
                try:
                    result = runner.run(
                        ['git', 'config', '-f', cfg_file,
                         'datalad.ora-remote.uuid'],
                        protocol=StdOutCapture
                    )
                    org_uuid = result['stdout'].strip()
                except CommandError as e:
                    # doesn't contain what we are looking for
                    lgr.debug("Found no UUID for ORA special remote at "
                              "'%s' (%s)", RIA_REMOTE_NAME, exc_str(e))

        # Now, enable it. If annex-init didn't fail to enable it as stored,
        # we wouldn't end up here, so enable with the store URL as
        # suggested by the URL we cloned from.
        if org_uuid:
            srs = repo.get_special_remotes()
            if org_uuid in srs.keys():
                # TODO: - Double-check autoenable value and only do this
                #         when true?
                #       - What if it still fails? -> Annex shouldn't change
                #         the config in that case
                # we only need the store:
                new_url = props['source'].split('#')[0]
                try:
                    repo.enable_remote(srs[org_uuid]['name'],
                                       options=['url={}'.format(new_url)])
                    lgr.info("Reconfigured %s for %s",
                             srs[org_uuid]['name'], new_url)
                    # update ora_remotes for considering publication
                    # dependency below
                    ora_remotes = [
                        s for s in ds.siblings('query',
                                               result_renderer='disabled')
                        if s.get('annex-externaltype', None) == 'ora']
                except CommandError as e:
                    lgr.debug("Failed to reconfigure ORA special remote: "
                              "%s", exc_str(e))
            else:
                lgr.debug("Unknown ORA special remote uuid at '%s': %s",
                          RIA_REMOTE_NAME, org_uuid)

    if ora_remotes:
        if len(ora_remotes) == 1:
            yield from ds.siblings('configure',
                                   name=RIA_REMOTE_NAME,
                                   publish_depends=ora_remotes[0]['name'],
                                   result_filter=None,
                                   result_renderer='disabled')
        else:
            lgr.warning("Found multiple ORA remotes. Couldn't decide which "
                        "publishing to 'origin' should depend on: %s. "
                        "Consider running 'datalad siblings configure -s "
                        "origin --publish-depends ORAREMOTENAME' to set "
                        "the publication dependency manually.",
                        [r['name'] for r in ora_remotes])
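
# To illustrate the effect of the configuration set above: after this
# function runs on a fresh RIA clone, the dataset carries a subdataset
# source candidate whose '{id}' placeholder datalad get fills in with each
# subdataset's ID when probing the store. A minimal hypothetical check;
# the dataset path and store URL shown are made up:
import datalad.api as dl

ds = dl.Dataset('myds')
print(ds.config.get('datalad.get.subdataset-source-candidate-200origin'))
# -> e.g. 'ria+ssh://store.example.com/store#{id}'
print(ds.config.get('remote.origin.annex-ignore'))
# -> 'true'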
#!/usr/bin/env python3

import os.path as op
import sys
import xml.dom.minidom

import datalad.api as dl
from datalad.support.network import download_url

ds = dl.Dataset(op.dirname(op.dirname(op.realpath(__file__))))

if 'datalad' not in ds.repo.get_remotes():
    from datalad.customremotes.base import init_datalad_remote
    init_datalad_remote(ds.repo, 'datalad', autoenable=True)

# doc = xml.dom.minidom.parse('/tmp/outi-7T.xml')
topurl = 'https://db.humanconnectome.org/data/archive/projects/HCP_Resources/resources/7T_Movies/'
doc = xml.dom.minidom.parseString(download_url(topurl))

files = [{f: e.getAttribute(f) for f in ('ID', 'URI', 'digest', 'name')}
         for e in doc.getElementsByTagName("cat:entry")]
# from pprint import pprint
# pprint(files)

added = list(
    ds.addurls(files, topurl + 'files/{URI}', '{URI}',
               fast=False, save=False))
print(f"Processed {len(added)} entries")
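
# Because addurls is invoked with save=False above, nothing is committed
# yet. A hypothetical follow-up, continuing the script, that fails loudly
# on error results and then saves (the commit message is made up):
failed = [r for r in added if r.get('status') not in ('ok', 'notneeded')]
if failed:
    sys.exit("%d addurls results failed" % len(failed))
ds.save(message="Add 7T movie files from ConnectomeDB")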