from tqdm import tqdm


def get_remote_urls(rec, remote):
    """Return the URLs recorded for the remote described as `remote`.

    Parameters
    ----------
    rec : dict
      Mapping whose values are dicts that may carry a 'description' and a
      'urls' entry (in the script below these are the per-key records
      returned by ``annex.whereis(..., output='full')``).
    remote : str
      Remote description to look for.  Both the plain form (``remote``) and
      the bracketed form (``[remote]``) are matched.

    Returns
    -------
    list
      The 'urls' value of the first matching entry (an empty list if that
      entry has no 'urls'), or an empty list if no entry matches.
    """
    # only the per-remote records matter here, their keys are not used
    for v in rec.values():
        if v.get('description', '') in (remote, '[%s]' % remote):
            return v.get('urls', [])
    return []


if __name__ == '__main__':
    annex = AnnexRepo('.', create=False, init=False)
    # enable datalad special remote
    urls_to_register = defaultdict(list)  # key: urls
    try:
        annex.call_annex(["enableremote", "datalad"])
        # go through each and see where urls aren't yet under web
        # seems might have also --in=datalad to restrict
        w = annex.whereis([], options=['--all'], output='full')
        lgr.info("Got %d entries", len(w))
        for k, rec in tqdm(w.items()):
            datalad_urls = get_remote_urls(rec, 'datalad')
            web_urls = set(get_remote_urls(rec, 'web'))
            for url in datalad_urls:
                if url not in web_urls:
                    if 'openneuro.s3' in url or 'openfmri.s3' in url:
                        urls_to_register[k].append(url)
                    else:
                        # pass the url as a logger argument (as done for the
                        # lgr.info call above) instead of pre-formatting
                        lgr.warning("Found unexpected url %s", url)
    finally:
        # NOTE(review): the body of this ``finally`` clause is not present in
        # the visible source; ``pass`` only keeps the script syntactically
        # valid -- restore the original cleanup code here.
        pass
def postclonecfg_annexdataset(ds, reckless, description=None):
    """If ds "knows annex" -- annex init it, set into reckless etc

    Provides additional tune up to a possibly an annex repo, e.g.
    "enables" reckless mode, sets up description

    This is a generator function: none of the steps below run until the
    returned generator is iterated.

    Parameters
    ----------
    ds
      Dataset-like object; its ``path``, ``config`` and ``repo`` attributes
      are used.
    reckless : str or None
      Reckless mode.  'auto' enables ``annex.hardlink`` and untrusts 'here';
      a value starting with 'shared-' untrusts 'here'; 'ephemeral' sets
      ``remote.origin.annex-ignore``, declares 'here' dead and, where
      possible, symlinks ``.git/annex`` to the annex of a local origin.
      Any other (or false) value triggers none of these.
    description : str, optional
      Passed to the annex initialization.  A true value forces a call to
      ``repo._init()`` even if the repository is already initialized.

    Yields
    ------
    Whatever ``configure_origins(ds, ds)`` yields.  Nothing is yielded if
    ``knows_annex(ds.path)`` is false.
    """
    # in any case check whether we need to annex-init the installed thing:
    if not knows_annex(ds.path):
        # not for us
        return

    # init annex when traces of a remote annex can be detected
    if reckless == 'auto':
        lgr.debug(
            "Instruct annex to hardlink content in %s from local "
            "sources, if possible (reckless)", ds.path)
        ds.config.set(
            'annex.hardlink', 'true', where='local', reload=True)

    lgr.debug("Initializing annex repo at %s", ds.path)
    # Note, that we cannot enforce annex-init via AnnexRepo().
    # If such an instance already exists, its __init__ will not be executed.
    # Therefore do quick test once we have an object and decide whether to call
    # its _init().
    #
    # Additionally, call init if we need to add a description (see #1403),
    # since AnnexRepo.__init__ can only do it with create=True
    repo = AnnexRepo(ds.path, init=True)
    if not repo.is_initialized() or description:
        repo._init(description=description)
    if reckless == 'auto' or (reckless and reckless.startswith('shared-')):
        repo.call_annex(['untrust', 'here'])

    elif reckless == 'ephemeral':
        # with ephemeral we declare 'here' as 'dead' right away, whenever
        # we symlink origin's annex, since availability from 'here' should
        # not be propagated for an ephemeral clone when we publish back to
        # origin.
        # This will cause stuff like this for a locally present annexed file:
        # % git annex whereis d1
        # whereis d1 (0 copies) failed
        # BUT this works:
        # % git annex find . --not --in here
        # % git annex find . --in here
        # d1

        # we don't want annex copy-to origin
        ds.config.set(
            'remote.origin.annex-ignore', 'true',
            where='local')

        ds.repo.set_remote_dead('here')

        if check_symlink_capability(ds.repo.dot_git / 'dl_link_test',
                                    ds.repo.dot_git / 'dl_target_test'):
            # symlink the annex to avoid needless copies in an ephemeral clone
            annex_dir = ds.repo.dot_git / 'annex'
            origin_annex_url = ds.config.get("remote.origin.url", None)
            origin_git_path = None
            if origin_annex_url:
                try:
                    # Deal with file:// scheme URLs as well as plain paths.
                    # If origin isn't local, we have nothing to do.
                    origin_git_path = Path(RI(origin_annex_url).localpath)

                    # we are local; check for a bare repo first to not mess w/
                    # the path
                    if GitRepo(origin_git_path, create=False).bare:
                        # origin is a bare repo -> use path as is
                        pass
                    elif origin_git_path.name != '.git':
                        origin_git_path /= '.git'
                except ValueError:
                    # Note, that accessing localpath on a non-local RI throws
                    # ValueError rather than resulting in an AttributeError.
                    # TODO: Warning level okay or is info level sufficient?
                    # Note, that setting annex-dead is independent of
                    # symlinking .git/annex. It might still make sense to
                    # have an ephemeral clone that doesn't propagate its avail.
                    # info. Therefore don't fail altogether.
                    lgr.warning("reckless=ephemeral mode: origin doesn't seem "
                                "local: %s\nno symlinks being used",
                                origin_annex_url)
            if origin_git_path:
                # TODO make sure that we do not delete any unique data
                # Get rid of whatever currently sits at .git/annex (a stale
                # symlink or a real directory) before linking to origin's.
                if annex_dir.is_symlink():
                    annex_dir.unlink()
                else:
                    rmtree(str(annex_dir))
                annex_dir.symlink_to(origin_git_path / 'annex',
                                     target_is_directory=True)
        else:
            # TODO: What level? + note, that annex-dead is independ
            lgr.warning("reckless=ephemeral mode: Unable to create symlinks on "
                        "this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if something
    # looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            # note the space after "bool." -- the adjacent literals are
            # concatenated into a single message
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool. '
                'You might need to enable it later manually and/or fix it up to'
                ' avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have up to
        # date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                # computed lazily, and only once, on first need
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # if has no auto-enable special remotes
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
def test_add_archive_content(path_orig, url, repo_path):
    """Exercise add_archive_content() on archives registered via URLs.

    Covers: failure without a repository / with a missing or outside file,
    plain extraction, the `existing` modes ('overwrite', 'archive-suffix'),
    invocation from subdirectories, exclude/rename/annex_options/delete,
    and dropping/re-obtaining extracted files and the archive key itself.

    Parameters
    ----------
    path_orig : str
      Directory containing the original '1.tar.gz' (it gets removed from
      there towards the end of the test).
    url : str
      Base location under which '1.tar.gz', '<N>u/1.tar.gz' (N=1..4) and
      'd1/1.tar.gz' are registered with ``repo.add_urls()``.
      NOTE(review): presumably an HTTP URL serving `path_orig`, supplied by
      decorators not visible here -- confirm.
    repo_path : str
      Directory in which the annex repository is created.
    """
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API
        assert_raises(RuntimeError, add_archive_content,
                      "nonexisting.tar.gz")  # no repo yet

        repo = AnnexRepo(repo_path, create=True)
        assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz")
        # we can't add a file from outside the repo ATM
        assert_raises(FileNotInRepositoryError, add_archive_content,
                      opj(path_orig, '1.tar.gz'))

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
            for s in range(1, 5):
                repo.add_urls([opj(url, '%du/1.tar.gz' % s)],
                              options=["--pathdepth", "-2"])
        repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_key(
            '1.tar.gz')  # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        repo_ = add_archive_content('1.tar.gz')
        eq_(repo.path, repo_.path)
        d1_basic_checks()

        # If ran again, should proceed just fine since the content is the
        # same so no changes would be made really
        add_archive_content('1.tar.gz')

        # But that other one carries updated file, so should fail due to
        # overwrite
        with assert_raises(RuntimeError) as cme:
            add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True)

        # TODO: somewhat not precise since we have two possible "already exists"
        # -- in caching and overwrite check
        assert_in("already exists", str(cme.exception))
        # but should do fine if overrides are allowed
        add_archive_content(opj('1u', '1.tar.gz'), existing='overwrite',
                            use_current_dir=True)
        add_archive_content(opj('2u', '1.tar.gz'), existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('3u', '1.tar.gz'), existing='archive-suffix',
                            use_current_dir=True)
        add_archive_content(opj('4u', '1.tar.gz'), existing='archive-suffix',
                            use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert all(x == whereis[0] for x in whereis[1:])

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive
        # prefix while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'),
                            add_archive_leading_dir=True)
    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        add_archive_content(
            '1.tar.gz', exclude=['d'], rename=['/ /_', '/^1/2'],
            annex_options="-c annex.largefiles=exclude=*.txt",
            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add first archive to the repo so we could test
        # named the same way but different content
        with swallow_outputs():
            repo.add_urls([opj(url, 'd1', '1.tar.gz')],
                          options=["--pathdepth", "-1"],
                          cwd=getpwd())  # invoke under current subdir
        repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz')
        d2_basic_checks()

    # in manual tests ran into the situation of inability to obtain on a single
    # run a file from an archive which was coming from a dropped key. I thought
    # it was tested in custom remote tests, but I guess not sufficiently well
    # enough
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=repo.path)
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online
    # archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
    # Inspect the failure only after the ``with`` block has exited: anything
    # placed after the raising call inside the block would never execute.
    # The caught exception is reached via ``.exception`` on the context
    # object, as done for ``cme`` above.
    # NOTE(review): assumes CommandError keeps extra keyword arguments
    # (incl. 'stdout_json') in a ``kwargs`` attribute -- confirm.
    drop_records = e.exception.kwargs['stdout_json']
    assert_equal(drop_records[0]['success'], False)
    assert_result_values_cond(
        drop_records, 'note',
        lambda x: '(Use --force to override this check, or adjust numcopies.)' in x
    )
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))