def test_backup_archive(path, objtree, archivremote):
    """Similar to test_archive_layout(), but not focused on compatibility
    with the directory-type special remote. Instead, it tests building a
    second RIA remote from an existing one, e.g. for backup purposes.
    """
    ds = create(path)
    setup_archive_remote(ds.repo, objtree)
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # copy files into the RIA archive
    ds.repo.copy_to('.', 'archive')

    targetpath = Path(archivremote) / ds.id[:3] / ds.id[3:] / 'archives'
    targetpath.mkdir(parents=True)
    subprocess.run(
        ['7z', 'u', str(targetpath / 'archive.7z'), '.'],
        cwd=str(Path(objtree) / ds.id[:3] / ds.id[3:] / 'annex' / 'objects'),
    )
    initexternalremote(
        ds.repo, '7z', 'ria', config={'base-path': archivremote})
    # wipe out the initial RIA remote (just for testing if the upcoming
    # one can fully take over)
    shutil.rmtree(objtree)
    # fsck to make git-annex aware of the loss
    assert_status('error', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='archive', fast=True)
    ])
    # now only available "here"
    eq_(len(ds.repo.whereis('one.txt')), 1)
    # make the backup archive known
    initexternalremote(
        ds.repo, 'backup', 'ria', config={'base-path': archivremote})
    # now fsck the new remote to get the new special remote indexed
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='backup', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # now we can drop all content locally, reobtain it, and survive an
    # fsck
    ds.drop('.')
    ds.get('.')
    assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()])
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    from unittest.mock import patch

    for row in rows:
        ds, filename = row["ds"], row["ds_filename"]
        with patch.object(ds.repo, "always_commit", False):
            res = ds.repo.add(filename)
            res_status = 'notneeded' if not res \
                else 'ok' if res.get('success', False) \
                else 'error'
            yield dict(
                action='add',
                # decorator dies with Path()
                path=str(ds.pathobj / filename),
                type='file',
                status=res_status,
                parentds=ds.path,
            )

            lgr.debug("Adding metadata to %s in %s", filename, ds.path)
            for a in ds.repo.set_metadata_(filename, add=row["meta_args"]):
                res = annexjson2result(a, ds, type="file", logger=lgr)
                # Don't show all added metadata for the file because that
                # could quickly flood the output.
                del res["message"]
                yield res
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    from mock import patch

    for row in rows:
        ds, filename = row["ds"], row["ds_filename"]
        with patch.object(ds.repo, "always_commit", False):
            res = ds.repo.add(filename)
            res_status = 'notneeded' if not res \
                else 'ok' if res.get('success', False) \
                else 'error'
            yield dict(
                action='add',
                # decorator dies with Path()
                path=text_type(ds.pathobj / filename),
                type='file',
                status=res_status,
                parentds=ds.path,
            )

            lgr.debug("Adding metadata to %s in %s", filename, ds.path)
            for a in ds.repo.set_metadata_(filename, add=row["meta_args"]):
                res = annexjson2result(a, ds, type="file", logger=lgr)
                # Don't show all added metadata for the file because that
                # could quickly flood the output.
                del res["message"]
                yield res
def test_annexjson2result(dspath):
    # no explicit success means 'error'
    eq_(annexjson2result(dict(), None),
        dict(status='error'))
    # unrecognized -> error
    eq_(annexjson2result(dict(success='random'), None),
        dict(status='error'))
    # success is possible ;-)
    eq_(annexjson2result(dict(success=True), None),
        dict(status='ok'))
    # path handling
    # needs a dataset
    ds = Dataset(dspath)
    eq_(annexjson2result(dict(file='file1'), ds),
        dict(status='error', path=str(ds.pathobj / 'file1')))
    # on all platforms, paths are reported in platform conventions
    # although git-annex reports in posix
    eq_(annexjson2result(dict(file='dir1/file1'), ds),
        dict(status='error', path=str(ds.pathobj / 'dir1' / 'file1')))
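# Illustration only (not part of the test suite): how a single raw record from
# `git annex add --json` would pass through annexjson2result(), following the
# assertions above. The import paths and the raw record below are assumptions
# based on usage elsewhere in this file, not verified git-annex output.
from datalad.distribution.dataset import Dataset
from datalad.interface.results import annexjson2result

def demo_annexjson2result(dspath):
    ds = Dataset(dspath)
    # hypothetical record, roughly as git-annex reports it (POSIX 'file' path)
    record = {"command": "add", "file": "dir1/file1", "success": True}
    res = annexjson2result(record, ds, type="file")
    # per the test above: success=True maps to status='ok', and the POSIX
    # path is resolved to platform conventions under the dataset root
    assert res["status"] == "ok"
    assert res["path"] == str(ds.pathobj / "dir1" / "file1")
    # extra keyword arguments appear in the result dict, as callers in this
    # file rely on (e.g. reading back res["action"] or deleting res["message"])
    assert res["type"] == "file"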
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
def _get_targetpaths(ds, content, refds_path, source, jobs):
    # not ready for Path instances...
    content = [str(c) for c in content]
    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    ds_repo = ds.repo
    # needs to be an annex to get content
    if not isinstance(ds_repo, AnnexRepo):
        for r in results_from_paths(
                content, status='notneeded',
                message="no dataset annex, content already present",
                action='get',
                type='file',
                logger=lgr,
                refds=refds_path):
            yield r
        return
    respath_by_status = {}
    try:
        results = ds_repo.get(
            content,
            options=['--from=%s' % source] if source else [],
            jobs=jobs)
    except CommandError as exc:
        results = exc.kwargs.get("stdout_json")
        if not results:
            raise
    for res in results:
        res = annexjson2result(res, ds, type='file', logger=lgr,
                               refds=refds_path)
        success = success_status_map[res['status']]
        # TODO: in case of some failed commands (e.g. get) there might
        # be no path in the record.  yoh has only vague idea of logic
        # here so just checks for having 'path', but according to
        # results_from_annex_noinfo, then it would be assumed that
        # `content` was acquired successfully, which is not the case
        if 'path' in res:
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
        yield res

    for r in results_from_annex_noinfo(
            ds, content, respath_by_status,
            dir_fail_msg='could not get some content in %s %s',
            noinfo_dir_msg='nothing to get from %s',
            noinfo_file_msg='already present',
            action='get', logger=lgr,
            refds=refds_path):
        yield r
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    for row in rows:
        ds, filename = row["ds"], row["ds_filename"]
        lgr.debug("Adding metadata to %s in %s", filename, ds.path)
        for a in ds.repo.set_metadata(filename, add=row["meta_args"]):
            res = annexjson2result(a, ds, type="file", logger=lgr)
            # Don't show all added metadata for the file because that
            # could quickly flood the output.
            del res["message"]
            yield res
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    from mock import patch

    for row in rows:
        ds, filename = row["ds"], row["ds_filename"]
        with patch.object(ds.repo, "always_commit", False):
            lgr.debug("Adding metadata to %s in %s", filename, ds.path)
            for a in ds.repo.set_metadata(filename, add=row["meta_args"]):
                res = annexjson2result(a, ds, type="file", logger=lgr)
                # Don't show all added metadata for the file because that
                # could quickly flood the output.
                del res["message"]
                yield res
def _postproc_result(res, respath_by_status, ds, **kwargs):
    res = annexjson2result(
        # annex reports are always about files
        res, ds, type='file', **kwargs)
    success = success_status_map[res['status']]
    respath_by_status[success] = \
        respath_by_status.get(success, []) + [res['path']]
    if res["status"] == "error" and res["action"] == "drop":
        msg = res["message"]
        if isinstance(msg, str) and "Use --force to" in msg:
            # Avoid confusing datalad-drop callers with git-annex-drop's
            # suggestion to use --force.
            res["message"] = msg.replace("--force", "--nocheck")
    return res
def add_meta(rows):
    """Call `git annex metadata --set` using information in `rows`.
    """
    from unittest.mock import patch

    # OPT: group by dataset first so to not patch/unpatch always_commit
    # per each file of which we could have thousands
    for ds, ds_rows in itertools.groupby(rows, itemgetter("ds")):
        with patch.object(ds.repo, "always_commit", False):
            for row in ds_rows:
                filename = row["ds_filename"]
                lgr.debug("Adding metadata to %s in %s", filename, ds.path)
                for a in ds.repo.set_metadata_(filename, add=row["meta_args"]):
                    res = annexjson2result(a, ds, type="file", logger=lgr)
                    # Don't show all added metadata for the file because that
                    # could quickly flood the output.
                    del res["message"]
                    yield res
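# Note (not from the original source): itertools.groupby() only merges
# *consecutive* items sharing a key, so the optimization above assumes the
# caller supplies `rows` already ordered by dataset. Should that ordering not
# be guaranteed, a pre-sort as sketched below would still keep the
# patch/unpatch of `always_commit` to a single cycle per dataset; the helper
# name is hypothetical.
import itertools
from operator import itemgetter

def _rows_grouped_by_ds(rows):
    # make rows of the same dataset consecutive, then group on the dataset
    # object itself, mirroring the groupby() call in add_meta() above
    rows = sorted(rows, key=lambda row: row["ds"].path)
    return itertools.groupby(rows, itemgetter("ds"))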
def add_urls(rows, ifexists=None, options=None):
    """Call `git annex addurl` using information in `rows`.
    """
    for row in rows:
        filename_abs = row["filename_abs"]
        ds, filename = row["ds"], row["ds_filename"]
        lgr.debug("Adding URLs to %s in %s", filename, ds.path)

        if os.path.exists(filename_abs) or os.path.islink(filename_abs):
            if ifexists == "skip":
                yield get_status_dict(action="addurls",
                                      ds=ds,
                                      type="file",
                                      path=filename_abs,
                                      status="notneeded")
                continue
            elif ifexists == "overwrite":
                lgr.debug("Removing %s", filename_abs)
                unlink(filename_abs)
            else:
                lgr.debug("File %s already exists", filename_abs)

        try:
            out_json = ds.repo.add_url_to_file(filename, row["url"],
                                               batch=True, options=options)
        except AnnexBatchCommandError as exc:
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  type="file",
                                  path=filename_abs,
                                  message=exc_str(exc),
                                  status="error")
            continue

        # In the case of an error, the json object has file=None.
        if out_json["file"] is None:
            out_json["file"] = filename_abs
        yield annexjson2result(out_json, ds, action="addurls",
                               type="file", logger=lgr)
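# Illustration only: the shape of a single `row` record, inferred from the
# keys read by add_meta() and add_urls() above. The concrete values are
# hypothetical; only the key names come from the code in this file.
example_row = {
    "ds": Dataset("/tmp/some/dataset"),   # dataset the file belongs to
    "ds_filename": "dir1/file1.dat",      # filename relative to that dataset
    "filename_abs": "/tmp/some/dataset/dir1/file1.dat",
    "url": "https://example.com/file1.dat",
    "meta_args": {"subject": "01"},       # fields for `git annex metadata --set`
}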
def _postproc_annexdrop_result(res, respath_by_status, ds, **kwargs):
    res = annexjson2result(
        # annex reports are always about files
        res, ds, type='file', **kwargs)
    success = success_status_map[res['status']]
    respath_by_status[success] = \
        respath_by_status.get(success, []) + [res['path']]
    if res["status"] == "error" and res["action"] == "drop":
        msg = res.get("message", None)
        if isinstance(msg, str) and "Use --force to" in msg:
            # Avoid confusing datalad-drop callers with git-annex-drop's
            # suggestion to use --force.
            # Just mention reckless itself, do not go into the details
            # of which mode. This is likely changing over time and
            # adjusting this replacement will be forgotten.
            res["message"] = msg.replace("--force", "--reckless availability")
    return res
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, message_file=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path") refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict( 'add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], 
discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths needs to go first, # because we know most about then, and subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert(not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict( status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) subds_relpath = relpath(ap['path'], ds_path) # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=None, name=None) except (CommandError, InvalidGitRepositoryError) as e: yield get_status_dict( ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disable to work correctly, while paying a little bit of # slow down #ap['staged'] = True to_save.append(ap) # report added subdatasets -- `annex add` below won't do it yield get_status_dict( ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append(dict( path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute 
additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) is_annex = isinstance(ds.repo, AnnexRepo) add_kw = {'jobs': jobs} if is_annex and jobs else {} added = ds.repo.add_( list(torepoadd.keys()), git=to_git if is_annex else True, **add_kw ) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked' to_save.append({k: v for k, v in res.items() if k not in ('status', 'state')}) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we loose a little bit of # coverage... res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/commited if so desired to_save.append({k: v for k, v in r.items() if k not in ('status', 'state')}) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append({k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])): # TODO XXX we have an issue here when with `add('.')` and annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. 
there is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this is costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
def _test_bare_git_version_1(host, dspath, store): # This test should take a dataset and create a bare repository at the remote # end from it. # Given, that it is placed correctly within a tree of dataset, that remote # thing should then be usable as an ora-remote as well as as a git-type # remote. # Note: Usability of git remote by annex depends on dataset layout version # (dirhashlower vs. -mixed). # For version 1 (lower) upload and consumption should be # interchangeable. It doesn't matter which remote is used for what # direction. ds_path = Path(dspath) store = Path(store) ds = Dataset(ds_path).create() populate_dataset(ds) ds.save() bare_repo_path, _, _ = get_layout_locations(1, store, ds.id) # Use git to make sure the remote end is what git thinks a bare clone of it # should look like subprocess.run([ 'git', 'clone', '--bare', quote_cmdlinearg(str(dspath)), quote_cmdlinearg(str(bare_repo_path)) ]) if host: url = "ria+ssh://{host}{path}".format(host=host, path=store) else: url = "ria+{}".format(store.as_uri()) init_opts = common_init_opts + ['url={}'.format(url)] # set up store: io = SSHRemoteIO(host) if host else LocalIO() create_store(io, store, '1') # set up the dataset location, too. # Note: Dataset layout version 1 (dirhash lower): create_ds_in_store(io, store, ds.id, '1', '1') # Now, let's have the bare repo as a git remote and use it with annex git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \ if host else bare_repo_path.as_uri() ds.repo.add_remote('bare-git', git_url) ds.repo.enable_remote('bare-git') # copy files to the remote ds.repo.copy_to('.', 'bare-git') eq_(len(ds.repo.whereis('one.txt')), 2) # now we can drop all content locally, reobtain it, and survive an # fsck ds.drop('.') ds.get('.') assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()]) # Now, add the ora remote: ds.repo.init_remote('ora-remote', options=init_opts) # fsck to make availability known assert_status('ok', [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='ora-remote', fast=True) ]) eq_(len(ds.repo.whereis('one.txt')), 3) # Now move content from git-remote to local and see it not being available # via bare-git anymore. ds.repo.call_annex(['move', '--all', '--from=bare-git']) # ora-remote doesn't know yet: eq_(len(ds.repo.whereis('one.txt')), 2) # But after fsck it does: fsck_res = [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='ora-remote', fast=True) ] assert_result_count(fsck_res, 1, status='error', message='** Based on the location log, one.txt\n' '** was expected to be present, ' 'but its content is missing.') assert_result_count(fsck_res, 1, status='error', message='** Based on the location log, subdir/two\n' '** was expected to be present, ' 'but its content is missing.') eq_(len(ds.repo.whereis('one.txt')), 1) # and the other way around: upload via ora-remote and have it available via # git-remote: ds.repo.copy_to('.', 'ora-remote') # fsck to make availability known assert_status('ok', [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='bare-git', fast=True) ]) eq_(len(ds.repo.whereis('one.txt')), 3)
def _test_bare_git_version_2(host, dspath, store): # Similarly to test_bare_git_version_1, this should ensure a bare git repo # at the store location for a dataset doesn't conflict with the ORA remote. # Note: Usability of git remote by annex depends on dataset layout version # (dirhashlower vs. -mixed). # For version 2 (mixed) upload via ORA and consumption via git should # work. But not the other way around, since git-annex uses # dirhashlower with bare repos. ds_path = Path(dspath) store = Path(store) ds = Dataset(ds_path).create() populate_dataset(ds) ds.save() bare_repo_path, _, _ = get_layout_locations(1, store, ds.id) # Use git to make sure the remote end is what git thinks a bare clone of it # should look like subprocess.run([ 'git', 'clone', '--bare', quote_cmdlinearg(str(dspath)), quote_cmdlinearg(str(bare_repo_path)) ]) if host: url = "ria+ssh://{host}{path}".format(host=host, path=store) else: url = "ria+{}".format(store.as_uri()) init_opts = common_init_opts + ['url={}'.format(url)] # set up store: io = SSHRemoteIO(host) if host else LocalIO() create_store(io, store, '1') # set up the dataset location, too. # Note: Dataset layout version 2 (dirhash mixed): create_ds_in_store(io, store, ds.id, '2', '1') # Now, let's have the bare repo as a git remote git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \ if host else bare_repo_path.as_uri() ds.repo.add_remote('bare-git', git_url) ds.repo.enable_remote('bare-git') # and the ORA remote in addition: ds.repo.init_remote('ora-remote', options=init_opts) # upload keys via ORA: ds.repo.copy_to('.', 'ora-remote') # bare-git doesn't know yet: eq_(len(ds.repo.whereis('one.txt')), 2) # fsck to make availability known assert_status('ok', [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='bare-git', fast=True) ]) eq_(len(ds.repo.whereis('one.txt')), 3) ds.drop('.') eq_(len(ds.repo.whereis('one.txt')), 2) # actually consumable via git remote: ds.repo.call_annex(['move', 'one.txt', '--from', 'bare-git']) eq_(len(ds.repo.whereis('one.txt')), 2) # now, move back via git - shouldn't be consumable via ORA ds.repo.call_annex(['move', 'one.txt', '--to', 'bare-git']) # fsck to make availability known, but there's nothing from POV of ORA: fsck_res = [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='ora-remote', fast=True) ] assert_result_count(fsck_res, 1, status='error', message='** Based on the location log, one.txt\n' '** was expected to be present, ' 'but its content is missing.') assert_result_count(fsck_res, 1, status='ok') eq_(len(fsck_res), 2) eq_(len(ds.repo.whereis('one.txt')), 1)
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path" ) refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) # TODO check if next isn't covered by discover_dataset_trace_to_targets already?? 
if dataset and ap.get('type', None) == 'dataset': # duplicates not possible, annotated_paths returns unique paths subds_to_add[ap['path']] = ap if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict('add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths needs to go first, # because we know most about then, and subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert (not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict(status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) # check that the subds has a commit, and refuse # to operate on it otherwise, or we would get a bastard # submodule that cripples git operations if not subds.repo.get_hexsha(): yield get_status_dict( ds=subds, status='impossible', message='cannot add subdataset with no commits', **dict(common_report, **ap)) continue subds_relpath = relpath(ap['path'], ds_path) # make an attempt to configure a submodule source URL based on the # discovered remote configuration remote, branch = subds.repo.get_tracking_branch() subds_url = 
subds.repo.get_remote_url( remote) if remote else None # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=subds_url, name=None) except CommandError as e: yield get_status_dict(ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disable to work correctly, while paying a little bit of # slow down #ap['staged'] = True to_save.append(ap) _fixup_submodule_dotgit_setup(ds, subds_relpath) # report added subdatasets -- `annex add` below won't do it yield get_status_dict(ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append( dict(path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) is_annex = isinstance(ds.repo, AnnexRepo) add_kw = {'jobs': jobs} if is_annex and jobs else {} added = ds.repo.add(list(torepoadd.keys()), git=to_git if is_annex else True, **add_kw) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked' to_save.append({ k: v for k, v in res.items() if k not in ('status', 'state') }) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we loose a little bit of # coverage... 
res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/commited if so desired to_save.append({ k: v for k, v in r.items() if k not in ('status', 'state') }) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append( {k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len( respath_by_status.get('success', [])): # TODO XXX we have an issue here when with `add('.')` and annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. there is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this is costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
def __call__( path=None, source=None, dataset=None, recursive=False, recursion_limit=None, get_data=True, description=None, reckless=False, #git_opts=None, #annex_opts=None, #annex_get_opts=None, jobs='auto', verbose=False, ): # IMPLEMENTATION CONCEPT: # # 1. Sort the world into existing handles and the rest # 2. Try locate missing handles (obtain subdatasets along the way) # 3. Expand into subdatasets with recursion enables (potentially # obtain even more subdatasets # 4. Shoot info of which handles to get in each subdataset to, # git-annex, once at the very end refds_path = Interface.get_refds_path(dataset) if not (dataset or path): raise InsufficientArgumentsError( "Neither dataset nor target path(s) provided") if dataset and not path: # act on the whole dataset if nothing else was specified path = refds_path # remember which results we already reported, to avoid duplicates yielded_ds = [] to_get = [] unavailable_paths = [] for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='get', # NOTE: Do not act upon unavailable paths yet! Done below after # testing which ones could be obtained unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): if ap.get('status', None): # we know what to report already yield ap continue if ap.get('state', None) == 'absent' and ap.get( 'raw_input', False): # if this wasn't found, but directly requested, queue for further # exploration unavailable_paths.append(ap) continue if ap.get('type', None) == 'dataset' and \ GitRepo.is_valid_repo(ap['path']) and \ not ap['path'] == refds_path: # do not report what hasn't arived yet # also do not report the base dataset that is already # present -- no surprise yield dict(ap, status='notneeded', logger=lgr, message='already installed') yielded_ds.append(ap['path']) ap['process_content'] = get_data to_get.append(ap) # explore the unknown for ap in sorted(unavailable_paths, key=lambda x: x['path']): lgr.debug("Investigate yet unavailable path %s", ap) # how close can we get? dspath = ap.get('parentds', get_dataset_root(ap['path'])) if dspath is None: # nothing we can do for this path continue lgr.debug("Found containing dataset %s for path %s", dspath, ap['path']) ds = Dataset(dspath) # now actually obtain whatever is necessary to get to this path containing_ds = [dspath] for res in _install_necessary_subdatasets(ds, ap['path'], reckless, refds_path, description=description): # yield immediately so errors could be acted upon outside, before # we continue if not (res['type'] == 'dataset' and res['path'] in yielded_ds): # unless we reported on this dataset before if res['type'] == 'dataset': # make a record, recursive below might now want to report # a 'notneeded' yielded_ds.append(res['path']) yield res # update to the current innermost dataset containing_ds.append(res['path']) if len(containing_ds) < 2: # no subdataset was installed, hence if the path was unavailable # before it still is, no need to bother git annex ap.update(status='impossible', message='path does not exist') yield ap continue # important to only do the next for the innermost subdataset # as the `recursive` logic below relies on that! 
# set the correct parent, for a dataset this would be the second-last # reported subdataset ap.update(parentds=containing_ds[-1]) if containing_ds[-1] == ap['path']: # the path actually refers to the last installed dataset ap.update(parentds=containing_ds[-2], process_content=get_data, type='dataset') to_get.append(ap) # results of recursive installation of yet undiscovered datasets rec_get = [] if recursive and not recursion_limit == 'existing': # obtain any subdatasets underneath the paths given inside the # subdatasets that we know already exist # unless we do not want recursion into not-yet-installed datasets for ap in sorted(to_get, key=lambda x: x['path']): if ap['type'] not in ('dataset', 'directory') or not ap.get( 'raw_input', False): # a non-directory cannot have content underneath # also we do NOT want to recurse into anything that was specifically # requested, to avoid duplication continue subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds']) lgr.info("Installing %s%s recursively", subds, (" underneath %s" % ap['path'] if subds.path != ap['path'] else "")) for res in _recursive_install_subds_underneath( subds, # `ap['path']` was explicitly given as input # we count recursions from the input, hence we # can start with the full number recursion_limit, reckless, start=ap['path'], refds_path=refds_path, description=description): # yield immediately so errors could be acted upon # outside, before we continue if not (res['type'] == 'dataset' and res['path'] in yielded_ds): # unless we reported on this dataset before if res['type'] == 'dataset': # make a record yielded_ds.append(res['path']) yield res if not (res['status'] == 'ok' and res['type'] == 'dataset'): # not a dataset that was just installed, we just reported it # upstairs, and can ignore it from now on continue # paranoia, so popular these days... assert GitRepo.is_valid_repo(res['path']) # keep a copy of the install record for `get` later on get_ap = { k: v for k, v in res.items() if not k == 'status' } get_ap['process_content'] = get_data rec_get.append(get_ap) if not get_data: # done already return # merge the two AP lists to_get.extend(rec_get) # sort into datasets content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( to_get, refds_path=refds_path) assert (not completed) # hand over to git-annex, get files content, # report files in git as 'notneeded' to get for ds_path in sorted(content_by_ds.keys()): ds = Dataset(ds_path) # grab content, ignore subdataset entries content = [ ap['path'] for ap in content_by_ds[ds_path] if ap.get('type', None) != 'dataset' or ap['path'] == ds.path ] if not content: # cut this short should there be nothing continue # needs to be an annex to get content if not isinstance(ds.repo, AnnexRepo): for r in results_from_paths( content, status='notneeded', message="no dataset annex, content already present", action='get', logger=lgr, refds=refds_path): yield r continue respath_by_status = {} for res in ds.repo.get(content, options=['--from=%s' % source] if source else [], jobs=jobs): res = annexjson2result(res, ds, type='file', logger=lgr, refds=refds_path) success = success_status_map[res['status']] # TODO: in case of some failed commands (e.g. get) there might # be no path in the record. 
yoh has only vague idea of logic # here so just checks for having 'path', but according to # results_from_annex_noinfo, then it would be assumed that # `content` was acquired successfully, which is not the case if 'path' in res: respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] yield res for r in results_from_annex_noinfo( ds, content, respath_by_status, dir_fail_msg='could not get some content in %s %s', noinfo_dir_msg='nothing to get from %s', noinfo_file_msg='already present', action='get', logger=lgr, refds=refds_path): yield r
def test_create_as_bare(origin, remote_base_path, remote_base_url, public, consumer, tmp_location): # Note/TODO: Do we need things like: # git config receive.denyCurrentBranch updateInstead # mv .hooks/post-update.sample hooks/post-update # git update-server-info # Test how we build a riaremote from an existing dataset, that is a bare git repo and can be accessed as a git type # remote as well. This should basically outline how to publish to that kind of structure as a data store, that is # autoenabled, so we can publish to github/gitlab and make that storage known. remote_base_path = Path(remote_base_path) ds = create(origin) populate_dataset(ds) ds.save() assert_repo_status(ds.path) # add the ria remote: # Note: For serve_path_via_http to work (which we need later), the directory needs to already exist. # But by default RIARemote will reject to create the remote structure in an already existing directory, # that wasn't created by itself (lacks as ria-layout-version file). # So, we can either configure force-write here or put a version file in it beforehand. # However, this is specific to the test environment! with open(str(remote_base_path / 'ria-layout-version'), 'w') as f: f.write('1') initexternalremote(ds.repo, 'riaremote', 'ria', config={'base-path': str(remote_base_path)}) # pretty much any annex command that talks to that remote should now trigger the actual creation on the remote end: assert_status('ok', [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='riaremote', fast=True) ]) remote_dataset_path = remote_base_path / ds.id[:3] / ds.id[3:] assert remote_base_path.exists() assert remote_dataset_path.exists() ds.repo.copy_to('.', 'riaremote') # Now, let's make the remote end a valid, bare git repository eq_( subprocess.run(['git', 'init', '--bare'], cwd=str(remote_dataset_path)).returncode, 0) #subprocess.run(['mv', 'hooks/post-update.sample', 'hooks/post-update'], cwd=remote_dataset_path) #subprocess.run(['git', 'update-server-info'], cwd=remote_dataset_path) # TODO: we might need "mv .hooks/post-update.sample hooks/post-update", "git update-server-info" as well # add as git remote and push everything eq_( subprocess.run( ['git', 'remote', 'add', 'bare-git', str(remote_dataset_path)], cwd=origin).returncode, 0) # Note: "--mirror" does the job for this test, while it might not be a good default some kind of # datalad-create-sibling. However those things need to be configurable for actual publish/creation routine anyway eq_( subprocess.run(['git', 'push', '--mirror', 'bare-git'], cwd=origin).returncode, 0) # annex doesn't know the bare-git remote yet: eq_(len(ds.repo.whereis('one.txt')), 2) # But after enableremote and a fsck it does: eq_( subprocess.run(['git', 'annex', 'enableremote', 'bare-git'], cwd=origin).returncode, 0) assert_status('ok', [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='bare-git', fast=True) ]) eq_(len(ds.repo.whereis('one.txt')), 3) # we can drop and get again via 'bare-git' remote: ds.drop('.') eq_(len(ds.repo.whereis('one.txt')), 2) eq_( subprocess.run( ['git', 'annex', 'get', 'one.txt', '--from', 'bare-git'], cwd=origin).returncode, 0) eq_(len(ds.repo.whereis('one.txt')), 3) # let's get the other one from riaremote eq_(len(ds.repo.whereis(op.join('subdir', 'two'))), 2) eq_( subprocess.run([ 'git', 'annex', 'get', op.join('subdir', 'two'), '--from', 'riaremote' ], cwd=origin).returncode, 0) eq_(len(ds.repo.whereis(op.join('subdir', 'two'))), 3) raise SkipTest("NOT YET DONE")
def test_bare_git(origin, remote_base_path): remote_base_path = Path(remote_base_path) # This test should take a dataset and create a bare repository at the remote end from it. # Given, that it is placed correctly within a tree of dataset, that remote thing should then be usable as a # ria-remote as well as as a git-type remote ds = create(origin) populate_dataset(ds) ds.save() assert_repo_status(ds.path) # Use git to make sure the remote end is what git thinks a bare clone of it should look like bare_repo_path = remote_base_path / ds.id[:3] / ds.id[3:] subprocess.run(['git', 'clone', '--bare', origin, str(bare_repo_path)]) # Now, let's have the bare repo as a git remote and use it with annex eq_( subprocess.run( ['git', 'remote', 'add', 'bare-git', str(bare_repo_path)], cwd=origin).returncode, 0) eq_( subprocess.run(['git', 'annex', 'enableremote', 'bare-git'], cwd=origin).returncode, 0) eq_( subprocess.run(['git', 'annex', 'testremote', 'bare-git'], cwd=origin).returncode, 0) # copy files to the remote ds.repo.copy_to('.', 'bare-git') eq_(len(ds.repo.whereis('one.txt')), 2) # now we can drop all content locally, reobtain it, and survive an # fsck ds.drop('.') ds.get('.') assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()]) # Since we created the remote this particular way instead of letting ria-remote create it, we need to put # ria-layout-version files into it. Then we should be able to also add it as a ria-remote. with open(str(remote_base_path / 'ria-layout-version'), 'w') as f: f.write('1') with open(str(bare_repo_path / 'ria-layout-version'), 'w') as f: f.write('1') # Now, add the ria remote: initexternalremote(ds.repo, 'riaremote', 'ria', config={'base-path': str(remote_base_path)}) # fsck to make availability known assert_status('ok', [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='riaremote', fast=True) ]) eq_(len(ds.repo.whereis('one.txt')), 3) # Now move content from git-remote to local and see it not being available via bare-git anymore eq_( subprocess.run(['git', 'annex', 'move', '--all', '--from=bare-git'], cwd=origin).returncode, 0) # ria-remote doesn't know yet: eq_(len(ds.repo.whereis('one.txt')), 2) # But after fsck it does: fsck_res = [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='riaremote', fast=True) ] assert_result_count( fsck_res, 1, status='error', message= '** Based on the location log, one.txt\n** was expected to be present, ' 'but its content is missing.') assert_result_count( fsck_res, 1, status='error', message= '** Based on the location log, subdir/two\n** was expected to be present, ' 'but its content is missing.') eq_(len(ds.repo.whereis('one.txt')), 1)
def _push_data(ds, target, content, data, force, jobs, res_kwargs,
               got_path_arg=False):
    if ds.config.getbool('remote.{}'.format(target), 'annex-ignore', False):
        lgr.debug(
            "Target '%s' is set to annex-ignore, exclude from data-push.",
            target,
        )
        return
    ds_repo = ds.repo
    res_kwargs['target'] = target
    if not ds.config.get('.'.join(('remote', target, 'annex-uuid')), None):
        # this remote either isn't an annex,
        # or hasn't been properly initialized
        # given that there was no annex-ignore, let's try to init it
        # see https://github.com/datalad/datalad/issues/5143 for the story
        ds_repo.localsync(target)
        if not ds.config.get('.'.join(('remote', target, 'annex-uuid')), None):
            # still nothing
            # rather than barfing tons of messages for each file, do one
            # for the entire dataset
            yield dict(
                res_kwargs,
                action='copy',
                status='impossible'
                if force in ('all', 'checkdatapresent') else 'notneeded',
                message=("Target '%s' does not appear to be an annex remote",
                         target))
            return

    # it really looks like we will transfer files, get info on what annex
    # has in store
    # paths must be recoded to a dataset REPO root (in case of a symlinked
    # location)
    annex_info_init = \
        {ds_repo.pathobj / Path(c['path']).relative_to(ds.pathobj): c
         for c in content} if ds.pathobj != ds_repo.pathobj else \
        {Path(c['path']): c for c in content}
    content = ds.repo.get_content_annexinfo(
        # paths are taken from `annex_info_init`
        paths=None,
        init=annex_info_init,
        ref='HEAD',
        # this is an expensive operation that is only needed
        # to perform a warning below, and for more accurate
        # progress reporting (exclude unavailable content).
        # limit to cases with explicit paths provided
        eval_availability=True if got_path_arg else False,
    )
    # figure out which of the reported content (after evaluating
    # `since` and `path` arguments) needs transport
    to_transfer = [
        c for c in content.values()
        # by force
        if ((
            force in ('all', 'checkdatapresent')
            or
            # or by modification report
            c.get('state', None) not in ('clean', 'deleted'))
            # only consider annex'ed files
            and 'key' in c)
    ]
    if got_path_arg:
        for c in [c for c in to_transfer if not c.get('has_content', False)]:
            yield dict(
                res_kwargs,
                type=c['type'],
                path=c['path'],
                action='copy',
                status='impossible',
                message='Slated for transport, but no content present',
            )

    cmd = ['copy', '--batch', '-z', '--to', target]

    if jobs:
        cmd.extend(['--jobs', str(jobs)])

    # Since we got here - we already have some data != "nothing"
    if (data == 'auto') or \
        (
            (data == 'auto-if-wanted') and
            ds_repo.get_preferred_content('wanted', target)
        ):
        lgr.debug("Invoking copy --auto")
        cmd.append('--auto')

    if force not in ('all', 'checkdatapresent'):
        # if we force, we do not trust local knowledge and do the checks
        cmd.append('--fast')

    lgr.debug("Push data from %s to '%s'", ds, target)

    # input has type=dataset, but now it is about files
    res_kwargs.pop('type', None)

    # A set and an OrderedDict is used to track files pointing to the
    # same key.  The set could be dropped, using a single dictionary
    # that has an entry for every seen key and a (likely empty) list
    # of redundant files, but that would mean looping over potentially
    # many keys to yield likely few if any notneeded results.
    seen_keys = set()
    repkey_paths = OrderedDict()

    # produce final path list. use knowledge that annex command will
    # run in the root of the dataset and compact paths to be relative
    # to this location
    file_list = b''
    nbytes = 0
    for c in to_transfer:
        key = c['key']
        if key in seen_keys:
            repkey_paths.setdefault(key, []).append(c['path'])
        else:
            file_list += bytes(Path(c['path']).relative_to(ds.pathobj))
            file_list += b'\0'
            nbytes += c['bytesize']
            seen_keys.add(key)
    lgr.debug('Counted %d bytes of annex data to transfer', nbytes)

    # and go
    res = ds_repo._call_annex_records(
        cmd,
        stdin=file_list,
        progress=True,
        # tailor the progress protocol with the total number of files
        # to be transferred
        total_nbytes=nbytes)
    for j in res:
        yield annexjson2result(j, ds, type='file', **res_kwargs)

    for annex_key, paths in repkey_paths.items():
        for path in paths:
            yield dict(res_kwargs, action='copy', type='file',
                       status='notneeded', path=path, annexkey=annex_key,
                       message='Another file points to the same key')
    return
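# Illustration only: the stdin payload assembled above for
# `git annex copy --batch -z` is a NUL-delimited list of paths relative to the
# dataset root, encoded as bytes. The paths and helper below are hypothetical.
def _demo_batch_file_list(paths, ds_root):
    # mirrors the file_list construction in _push_data()
    from pathlib import Path
    file_list = b''
    for p in paths:
        file_list += bytes(Path(p).relative_to(ds_root)) + b'\0'
    return file_list

# _demo_batch_file_list(['/tmp/ds/one.txt', '/tmp/ds/subdir/two'], '/tmp/ds')
# -> b'one.txt\x00subdir/two\x00'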
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. Sort the world into existing handles and the rest
    # 2. Try to locate missing handles (obtain subdatasets along the way)
    # 3. Expand into subdatasets with recursion enabled (potentially
    #    obtaining even more subdatasets)
    # 4. Shoot the info of which handles to get in each subdataset to
    #    git-annex, once at the very end

    refds_path = Interface.get_refds_path(dataset)

    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path

    # remember which results we already reported, to avoid duplicates
    yielded_ds = []
    to_get = []
    unavailable_paths = []
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='get',
            # NOTE: Do not act upon unavailable paths yet! Done below after
            # testing which ones could be obtained
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
            # if this wasn't found, but directly requested, queue for further
            # exploration
            unavailable_paths.append(ap)
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            # do not report what hasn't arrived yet
            # also do not report the base dataset that is already
            # present -- no surprise
            yield dict(ap, status='notneeded', logger=lgr,
                       message='already installed')
            yielded_ds.append(ap['path'])
            ap['process_content'] = get_data
        to_get.append(ap)

    # explore the unknown
    for ap in sorted(unavailable_paths, key=lambda x: x['path']):
        lgr.debug("Investigate yet unavailable path %s", ap)
        # how close can we get?
        dspath = ap.get('parentds', get_dataset_root(ap['path']))
        if dspath is None:
            # nothing we can do for this path
            continue
        lgr.debug("Found containing dataset %s for path %s",
                  dspath, ap['path'])
        ds = Dataset(dspath)
        # now actually obtain whatever is necessary to get to this path
        containing_ds = [dspath]
        for res in _install_necessary_subdatasets(
                ds, ap['path'], reckless, refds_path,
                description=description):
            # yield immediately so errors could be acted upon outside, before
            # we continue
            if not (res['type'] == 'dataset' and
                    res['path'] in yielded_ds):
                # unless we reported on this dataset before
                if res['type'] == 'dataset':
                    # make a record, recursive below might now want to report
                    # a 'notneeded'
                    yielded_ds.append(res['path'])
                yield res
            # update to the current innermost dataset
            containing_ds.append(res['path'])

        if len(containing_ds) < 2:
            # no subdataset was installed, hence if the path was unavailable
            # before it still is, no need to bother git annex
            ap.update(status='impossible', message='path does not exist')
            yield ap
            continue

        # important to only do the next for the innermost subdataset
        # as the `recursive` logic below relies on that!
        # set the correct parent, for a dataset this would be the second-last
        # reported subdataset
        ap.update(parentds=containing_ds[-1])
        if containing_ds[-1] == ap['path']:
            # the path actually refers to the last installed dataset
            ap.update(parentds=containing_ds[-2],
                      process_content=get_data,
                      type='dataset')
        to_get.append(ap)

    # results of recursive installation of yet undiscovered datasets
    rec_get = []
    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist
        # unless we do not want recursion into not-yet-installed datasets
        for ap in sorted(to_get, key=lambda x: x['path']):
            if ap['type'] not in ('dataset', 'directory') or \
                    not ap.get('raw_input', False):
                # a non-directory cannot have content underneath
                # also we only recurse into what was specifically requested,
                # to avoid duplication
                continue
            subds = Dataset(ap['path']
                            if ap['type'] == 'dataset'
                            else ap['parentds'])
            lgr.info(
                "Installing %s%s recursively",
                subds,
                (" underneath %s" % ap['path']
                 if subds.path != ap['path']
                 else ""))
            for res in _recursive_install_subds_underneath(
                    subds,
                    # `ap['path']` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    start=ap['path'],
                    refds_path=refds_path,
                    description=description):
                # yield immediately so errors could be acted upon
                # outside, before we continue
                if not (res['type'] == 'dataset' and
                        res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record
                        yielded_ds.append(res['path'])
                    yield res
                if not (res['status'] == 'ok' and res['type'] == 'dataset'):
                    # not a dataset that was just installed, we just
                    # reported it upstairs, and can ignore it from now on
                    continue
                # paranoia, so popular these days...
                assert GitRepo.is_valid_repo(res['path'])
                # keep a copy of the install record for `get` later on
                get_ap = {k: v for k, v in res.items() if not k == 'status'}
                get_ap['process_content'] = get_data
                rec_get.append(get_ap)

    if not get_data:
        # done already
        return

    # merge the two AP lists
    to_get.extend(rec_get)

    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_get,
            refds_path=refds_path)
    assert(not completed)

    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    for ds_path in sorted(content_by_ds.keys()):
        ds = Dataset(ds_path)
        # grab content, ignore subdataset entries
        content = [ap['path'] for ap in content_by_ds[ds_path]
                   if ap.get('type', None) != 'dataset' or
                   ap['path'] == ds.path]
        if not content:
            # cut this short should there be nothing
            continue
        # needs to be an annex to get content
        if not isinstance(ds.repo, AnnexRepo):
            for r in results_from_paths(
                    content, status='notneeded',
                    message="no dataset annex, content already present",
                    action='get', logger=lgr,
                    refds=refds_path):
                yield r
            continue
        respath_by_status = {}
        for res in ds.repo.get(
                content,
                options=['--from=%s' % source] if source else [],
                jobs=jobs):
            res = annexjson2result(res, ds, type='file', logger=lgr,
                                   refds=refds_path)
            success = success_status_map[res['status']]
            # TODO: some failed commands (e.g. get) may not include a 'path'
            # in the record; we only record paths when present, but
            # results_from_annex_noinfo will then assume that such `content`
            # was acquired successfully, which is not the case
            if 'path' in res:
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
            yield res

        for r in results_from_annex_noinfo(
                ds, content, respath_by_status,
                dir_fail_msg='could not get some content in %s %s',
                noinfo_dir_msg='nothing to get from %s',
                noinfo_file_msg='already present',
                action='get', logger=lgr,
                refds=refds_path):
            yield r
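
# The bookkeeping above buckets per-file git-annex result paths by whether
# their status maps to success, so files git-annex stayed silent about can be
# reported separately afterwards. A minimal sketch of that grouping step
# (hypothetical result dicts and status map, not the DataLad API):

def group_paths_by_success(results, success_map):
    """Map each result's status through success_map and bucket its path."""
    respath_by_status = {}
    for res in results:
        success = success_map[res['status']]
        if 'path' in res:
            # some failed operations may lack a path; skip those
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
    return respath_by_status


# example: 'ok' and 'notneeded' count as success, 'error' does not
demo_status_map = {'ok': True, 'notneeded': True, 'error': False}
demo_results = [
    {'status': 'ok', 'path': 'one.txt'},
    {'status': 'error', 'path': 'two.txt'},
    {'status': 'error'},  # no path reported
]
assert group_paths_by_success(demo_results, demo_status_map) == \
    {True: ['one.txt'], False: ['two.txt']}
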
def _push_data(ds, target, content, data, force, jobs, res_kwargs,
               got_path_arg=False):
    if ds.config.getbool('remote.{}'.format(target), 'annex-ignore', False):
        lgr.debug(
            "Target '%s' is set to annex-ignore, exclude from data-push.",
            target,
        )
        return
    res_kwargs['target'] = target
    if not ds.config.get('.'.join(('remote', target, 'annex-uuid')), None):
        # this remote either isn't an annex,
        # or hasn't been properly initialized
        # rather than barfing tons of messages for each file, do one
        # for the entire dataset
        yield dict(
            res_kwargs,
            action='copy',
            status='impossible'
            if force in ('all', 'checkdatapresent') else 'notneeded',
            message=("Target '%s' does not appear to be an annex remote",
                     target))
        return

    # it really looks like we will transfer files, get info on what annex
    # has in store
    ds_repo = ds.repo
    # paths must be recoded to a dataset REPO root (in case of a symlinked
    # location)
    annex_info_init = \
        {ds_repo.pathobj / Path(c['path']).relative_to(ds.pathobj): c
         for c in content} if ds.pathobj != ds_repo.pathobj else \
        {Path(c['path']): c for c in content}
    content = ds.repo.get_content_annexinfo(
        # paths are taken from `annex_info_init`
        paths=None,
        init=annex_info_init,
        ref='HEAD',
        # this is an expensive operation that is only needed
        # to perform a warning below, and for more accurate
        # progress reporting (exclude unavailable content).
        # limit to cases with explicit paths provided
        eval_availability=True if got_path_arg else False,
    )
    # figure out which of the reported content (after evaluating the
    # `since` and `path` arguments) needs transport
    to_transfer = [
        c for c in content.values()
        # by force
        if ((force in ('all', 'checkdatapresent')
             # or by modification report
             or c.get('state', None) not in ('clean', 'deleted'))
            # only consider annex'ed files
            and 'key' in c)
    ]
    if got_path_arg:
        for c in [c for c in to_transfer if not c.get('has_content', False)]:
            yield dict(
                res_kwargs,
                type=c['type'],
                path=c['path'],
                action='copy',
                status='impossible',
                message='Slated for transport, but no content present',
            )

    cmd = [
        'git', 'annex', 'copy', '--batch', '-z', '--to', target,
        '--json', '--json-error-messages', '--json-progress'
    ]
    if jobs:
        cmd.extend(['--jobs', str(jobs)])

    # Since we got here - we already have some data != "nothing"
    if (data == 'auto') or \
            ((data == 'auto-if-wanted') and
             ds_repo.get_preferred_content('wanted', target)):
        lgr.debug("Invoking copy --auto")
        cmd.append('--auto')

    if force not in ('all', 'checkdatapresent'):
        # if we force, we do not trust local knowledge and do the checks
        cmd.append('--fast')

    lgr.debug("Push data from %s to '%s'", ds, target)

    # input has type=dataset, but now it is about files
    res_kwargs.pop('type', None)

    # produce the final path list. use knowledge that the annex command will
    # run in the root of the dataset and compact paths to be relative
    # to this location
    # XXX must not be a SpooledTemporaryFile -- unclear why, but it doesn't
    # work otherwise
    with TemporaryFile() as file_list:
        nbytes = 0
        for c in to_transfer:
            file_list.write(bytes(Path(c['path']).relative_to(ds.pathobj)))
            file_list.write(b'\0')
            nbytes += c['bytesize']
        # rewind stdin buffer
        file_list.seek(0)

        # tailor the progress protocol with the total number of bytes
        # to be transferred
        class TailoredPushAnnexJsonProtocol(AnnexJsonProtocol):
            total_nbytes = nbytes

        # and go
        # TODO try-except and yield what was captured before the crash
        res = GitWitlessRunner(cwd=ds.path).run(
            cmd,
            # TODO report how many in total, and give global progress too
            protocol=TailoredPushAnnexJsonProtocol,
            stdin=file_list)
        for c in ('stdout', 'stderr'):
            if res[c]:
                lgr.debug(
                    'Received unexpected %s from `annex copy`: %s',
                    c, res[c])
        for j in res['stdout_json']:
            yield annexjson2result(j, ds, type='file', **res_kwargs)
    return
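
# The pattern above feeds `git annex copy --batch -z` its file list via stdin:
# NUL-delimited relative paths written to a real temporary file, which is then
# rewound and handed to the subprocess. A minimal sketch of the same pattern
# using plain subprocess, with POSIX `xargs -0 echo` standing in for the annex
# call and hypothetical paths:

import subprocess
from tempfile import TemporaryFile

paths = ['one.txt', 'subdir/two.txt']

with TemporaryFile() as file_list:
    for p in paths:
        # NUL-delimited entries, matching the -z convention
        file_list.write(p.encode() + b'\0')
    # rewind so the subprocess reads from the start of the buffer
    file_list.seek(0)
    out = subprocess.run(
        ['xargs', '-0', 'echo'],
        stdin=file_list,
        capture_output=True,
    )
print(out.stdout.decode())  # -> "one.txt subdir/two.txt"
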