def test_dirty(path):
    for mode in _dirty_modes:
        # does nothing without a dataset
        handle_dirty_dataset(None, mode)
    # placeholder, but not yet created
    ds = Dataset(path)
    # unknown mode
    assert_raises(ValueError, handle_dirty_dataset, ds, 'MADEUP')
    # not yet created is very dirty
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail')
    handle_dirty_dataset(ds, 'ignore')
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'save-before')
    # should yield a clean repo
    ds.create()
    orig_state = ds.repo.get_hexsha()
    _check_all_clean(ds, orig_state)
    # tainted: untracked
    with open(opj(ds.path, 'something'), 'w') as f:
        f.write('some')
    orig_state = _check_auto_save(ds, orig_state)
    # tainted: staged
    with open(opj(ds.path, 'staged'), 'w') as f:
        f.write('some')
    ds.repo.add('staged', git=True)
    orig_state = _check_auto_save(ds, orig_state)
    # tainted: submodule
    # not added to super on purpose!
    subds = ds.create('subds')
    _check_all_clean(subds, subds.repo.get_hexsha())
    ok_clean_git(ds.path)
    # subdataset must be added as a submodule!
    assert_equal(ds.get_subdatasets(), ['subds'])
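# The tests above pin down the observable contract of `handle_dirty_dataset`
# without showing its body. Below is a minimal sketch consistent with that
# contract. The mode names come straight from the tests, but this exact body
# is an assumption; the real implementation (presumably in
# datalad.interface.utils) differs in detail, e.g. in how dirtiness is
# detected and how the save is performed:
_dirty_modes = ('fail', 'ignore', 'save-before')


def handle_dirty_dataset(ds, mode, msg=None):
    """Act on a potentially dirty dataset according to `mode` (sketch)."""
    if ds is None:
        # nothing to check or save without a dataset
        return
    if mode not in _dirty_modes:
        raise ValueError("unknown dirty-handling mode %r" % (mode,))
    if mode == 'ignore':
        return
    if ds.repo is None:
        # a not-yet-created dataset is "very dirty": both 'fail' and
        # 'save-before' refuse to proceed (see test_dirty above)
        raise RuntimeError("dataset %s has no repository" % ds)
    if ds.repo.dirty:
        if mode == 'fail':
            raise RuntimeError("dataset %s has unsaved changes" % ds)
        # mode == 'save-before': save to arrive at a clean state
        # (assumes a bound Dataset.save() method)
        ds.save(message=msg or '[DATALAD] auto-saved changes')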
def _check_auto_save(ds, orig_state):
    handle_dirty_dataset(ds, 'ignore')
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail')
    handle_dirty_dataset(ds, 'save-before')
    state = ds.repo.get_hexsha()
    assert_not_equal(orig_state, state)
    _check_all_clean(ds, state)
    return state
def __call__(
        path=None,
        source=None,
        dataset=None,
        get_data=False,
        description=None,
        recursive=False,
        recursion_limit=None,
        if_dirty='save-before',
        save=True,
        reckless=False,
        git_opts=None,
        git_clone_opts=None,
        annex_opts=None,
        annex_init_opts=None,
        jobs=None):
    # normalize path argument to be equal when called from cmdline and
    # python and nothing was passed into `path`
    path = assure_list(path)

    if not source and not path:
        raise InsufficientArgumentsError(
            "Please provide at least a source or a path")

    ## Common kwargs to pass to underlying git/install calls.
    #  They might need adjustments (e.g. for recursion_limit), but
    #  otherwise would be applicable throughout
    #
    # There should have been more of common options!
    # since underneath get could do similar installs, but now they
    # have duplicated implementations which differ (e.g. get does not
    # annex init installed annexes)
    common_kwargs = dict(
        get_data=get_data,
        recursive=recursive,
        recursion_limit=recursion_limit,
        git_opts=git_opts,
        annex_opts=annex_opts,
        reckless=reckless,
        jobs=jobs,
    )

    installed_items = []
    failed_items = []

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    ds = None
    if dataset is not None:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='installation')
        handle_dirty_dataset(ds, if_dirty)

    # switch into the scenario without --source:
    if source is None:
        # we need to collect URLs and paths
        to_install = []
        to_get = []
        for urlpath in path:
            ri = RI(urlpath)
            (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

        common_kwargs['dataset'] = dataset

        # first install, and then get
        for s in to_install:
            lgr.debug("Install passes into install source=%s", s)
            try:
                result = Install.__call__(
                    source=s,
                    description=description,
                    if_dirty=if_dirty,
                    save=save,
                    git_clone_opts=git_clone_opts,
                    annex_init_opts=annex_init_opts,
                    **common_kwargs
                )
                installed_items += assure_list(result)
            except Exception as exc:
                lgr.warning("Installation of %s has failed: %s",
                            s, exc_str(exc))
                failed_items.append(s)

        if to_get:
            lgr.debug("Install passes into get %d items", len(to_get))
            # all commented out hint on inability to pass those options
            # into underlying install-related calls.
            # Also need to pass from get:
            #  annex_get_opts
            try:
                installed_datasets = Get.__call__(
                    to_get,
                    # description=description,
                    # if_dirty=if_dirty,
                    # save=save,
                    # git_clone_opts=git_clone_opts,
                    # annex_init_opts=annex_init_opts
                    _return_datasets=True,
                    **common_kwargs
                )
            except IncompleteResultsError as exc:
                exc_str_ = ': ' + exc_str(exc) if exc.results else ''
                lgr.warning("Some items failed to install: %s", exc_str_)
                installed_datasets = exc.results
                failed_items.extend(exc.failed)

            # compose content_by_ds into result
            for dspath in installed_datasets:
                ds_ = Dataset(dspath)
                if ds_.is_installed():
                    installed_items.append(ds_)
                else:
                    lgr.warning("%s was not installed", ds_)

        return Install._handle_and_return_installed_items(
            ds, installed_items, failed_items, save)

    if source and path and len(path) > 1:
        raise ValueError(
            "install needs a single PATH when source is provided. "
            "Was given multiple PATHs: %s" % str(path))

    # parameter constraints:
    if not source:
        raise InsufficientArgumentsError(
            "a `source` is required for installation")

    # code below deals with a single path only
    path = path[0] if path else None

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        raise ValueError(
            "installation `source` and destination `path` are identical. "
            "If you are trying to add a subdataset simply use `save` "
            "{0}".format(path))

    # resolve the target location (if local) against the provided dataset
    # or CWD:
    if path is not None:
        # Should work out just fine for regular paths, so no additional
        # conditioning is necessary
        try:
            path_ri = RI(path)
        except Exception as e:
            raise ValueError(
                "invalid path argument {}: ({})".format(path, exc_str(e)))
        try:
            # Wouldn't work for SSHRI ATM, see TODO within SSHRI
            # yoh: path should be a local path, and mapping note within
            #      SSHRI about mapping localhost:path to path is kinda
            #      a peculiar use-case IMHO
            path = resolve_path(path_ri.localpath, dataset)
            # any `path` argument that points to something local is now
            # resolved and is no longer a URL
        except ValueError:
            # URL doesn't point to a local something
            # so we have an actual URL in `path`. Since this is valid as a
            # single positional argument, `source` has to be None at this
            # point.
            if is_datalad_compat_ri(path) and source is None:
                # we have an actual URL -> this should be the source
                lgr.debug(
                    "Single argument given to install, that doesn't seem to "
                    "be a local path. "
                    "Assuming the argument identifies a source location.")
                source = path
                path = None
            else:
                # `path` is neither a valid source nor a local path.
                # TODO: The only thing left is a known subdataset with a
                # name, that is not a path; Once we correctly distinguish
                # between path and name of a submodule, we need to consider
                # this.
                # For now: Just raise
                raise ValueError("Invalid path argument {0}".format(path))
    # `path` resolved, if there was any.

    # Possibly do conversion from source into a git-friendly url
    # luckily GitRepo will undo any fancy file:/// url to make use of Git's
    # optimization for local clones....
    source = _get_git_url_from_source(source)
    lgr.debug("Resolved source: {0}".format(source))
    # TODO: we probably need to resolve source, if it is a local path;
    # expandpath, normpath, ... Where exactly is the point to do it?

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        lgr.debug(
            "Neither dataset nor target installation path provided. "
            "Deriving destination path from given source %s",
            source)
        path = _get_installationpath_from_url(source)
        # since this is a relative `path`, resolve it:
        path = resolve_path(path, dataset)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    lgr.debug("Resolved installation target: {0}".format(path))
    destination_dataset = Dataset(path)

    if destination_dataset.is_installed():
        # this should not be, check if this is an error, or a reinstall
        # from the same source
        # this is where we would have installed this from
        candidate_sources = _get_flexible_source_candidates(
            source, destination_dataset.path)
        # this is where it was installed from
        track_name, track_url = _get_tracking_source(destination_dataset)
        if track_url in candidate_sources or \
                get_local_file_url(track_url) in candidate_sources:
            # TODO: this one breaks "promise" assumptions of the repeated
            # invocations of install.
            # yoh thinks that we actually should be the ones to run update
            # (without merge) after basic
            # check that it is clean and up-to-date with its super dataset
            # and if so, not return here but continue with errands (recursive
            # installation and get_data) so we could provide the same
            # result if we rerun the same install twice.
            lgr.info(
                "%s was already installed from %s. Use `update` to obtain "
                "latest updates, or `get` or `install` with a path, not URL, "
                "to (re)fetch data and / or subdatasets",
                destination_dataset, track_url)
            return destination_dataset
        else:
            raise ValueError(
                "There is already a dataset installed at the "
                "destination: %s" % destination_dataset)

    ###########
    # we should know everything necessary by now
    # actual installation starts
    ###########

    # FLOW GUIDE:
    # four cases:
    # 1. install into a dataset
    #   1.1. we install a known subdataset
    #        => git submodule update --init
    #   1.2. we install an existing repo as a subdataset inplace
    #        => git submodule add + magic
    #   1.3. we (recursively) try to install implicit subdatasets between
    #        ds and path
    #   1.4. we install a new subdataset from an explicit source
    #        => git submodule add
    # 2. we "just" install from an explicit source
    #    => git clone

    if ds is not None:
        # FLOW GUIDE: 1.

        # express the destination path relative to the root of
        # the dataset
        relativepath = relpath(path, start=ds.path)
        if relativepath.startswith(pardir):
            raise ValueError("installation path outside dataset "
                             "({0})".format(path))
        lgr.debug("Resolved installation target relative to dataset "
                  "{0}: {1}".format(ds, relativepath))

        # FLOW_GUIDE 1.4.
        lgr.info("Installing subdataset from '{0}' at: {1}".format(
            source, relativepath))
        destination_dataset = _install_subds_from_flexible_source(
            ds,
            relativepath,
            source,
            reckless)
    else:
        # FLOW GUIDE: 2.
        lgr.info("Installing dataset at {0} from {1}".format(path, source))

        # Currently assuming there is nothing at the target to deal with
        # and rely on failures raising from the git call ...

        # We possibly need to consider /.git URL
        candidate_sources = _get_flexible_source_candidates(source)
        _clone_from_any_source(candidate_sources, destination_dataset.path)

    # FLOW GUIDE: All four cases done.
    if not destination_dataset.is_installed():
        # XXX shouldn't we just fail!? (unless some explicit --skip-failing?)
        lgr.error("Installation failed.")
        return None

    _handle_possible_annex_dataset(destination_dataset, reckless)

    lgr.debug("Installation of %s done.", destination_dataset)

    if not destination_dataset.is_installed():
        # log error and don't report as installed item, but don't raise,
        # since we might be in a process of recursive installation where
        # a lot of other datasets can still be installed successfully.
        lgr.error("Installation of {0} failed.".format(destination_dataset))
    else:
        installed_items.append(destination_dataset)

    # we need to decrease the recursion limit, relative to
    # subdatasets now
    subds_recursion_limit = max(0, recursion_limit - 1) \
        if isinstance(recursion_limit, int) \
        else recursion_limit
    # Now, recursive calls:
    if recursive:
        if description:
            # yoh: why? especially if we somehow allow for templating them
            # with e.g. '%s' to catch the subdataset path
            lgr.warning("Description can't be assigned recursively.")

        subs = destination_dataset.get_subdatasets(
            # yes, it does make sense to combine no recursion with
            # recursion_limit: when the latter is 0 we get no subdatasets
            # reported, otherwise we always get the 1st-level subs
            recursive=False,
            recursion_limit=recursion_limit,
            absolute=False)

        if subs:
            lgr.debug("Obtaining subdatasets of %s: %s",
                      destination_dataset,
                      subs)

            kwargs = common_kwargs.copy()
            kwargs['recursion_limit'] = subds_recursion_limit
            rec_installed = Get.__call__(
                subs,  # all at once
                dataset=destination_dataset,
                # TODO expose this
                # yoh: exactly!
                #annex_get_opts=annex_get_opts,
                **kwargs
            )
            # TODO do we want to filter this so `install` only returns
            # the datasets?
            if isinstance(rec_installed, list):
                installed_items.extend(rec_installed)
            else:
                installed_items.append(rec_installed)

    if get_data:
        lgr.debug("Getting data of {0}".format(destination_dataset))
        kwargs = common_kwargs.copy()
        kwargs['recursive'] = False
        destination_dataset.get(curdir, **kwargs)

    return Install._handle_and_return_installed_items(
        ds, installed_items, failed_items, save)
def __call__(path=None,
             dataset=None,
             recursive=False,
             recursion_limit=None,
             check=True,
             if_dirty='save-before'):
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `drop`: requires at least a path "
            "or dataset")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
    # this try-except dance is only to maintain a previous behavior of `drop`
    # where it did not ValueError, but yielded error status
    try:
        ds = require_dataset(
            dataset, check_installed=True, purpose='dropping content')
    except ValueError as e:
        yield dict(
            status='error',
            message=str(e),
            **res_kwargs,
        )
        return

    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path

    content_by_ds = {}
    for st in Status.__call__(
            # do not use `ds` to preserve path semantics
            dataset=dataset,
            path=path,
            annex=None,
            untracked='no',
            recursive=recursive,
            recursion_limit=recursion_limit,
            eval_subdataset_state='no',
            report_filetype='raw',
            return_type='generator',
            result_renderer=None,
            # yield errors and let caller decide
            on_failure='ignore'):
        if st['status'] == 'error':
            # Downstream code can't do anything with these. Let the caller
            # decide their fate.
            yield st
            continue
        # ignore submodule entries
        if st.get('type') == 'dataset':
            if not Dataset(st['path']).is_installed():
                continue
            parentds = st['path']
        else:
            parentds = st['parentds']
        cbd = content_by_ds.get(parentds, [])
        cbd.append(st['path'])
        content_by_ds[parentds] = cbd

    # iterate over all datasets, order doesn't matter
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        for r in _drop_files(
                ds, content_by_ds[ds_path], check=check, **res_kwargs):
            yield r
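# Usage sketch for the generator variant above, assuming the standard
# result-record convention (dicts with at least 'action', 'status', 'path');
# the file name is a placeholder:
#
#   from datalad.api import drop
#
#   for res in drop(path='large.dat', dataset='.',
#                   return_type='generator', on_failure='ignore'):
#       if res['status'] not in ('ok', 'notneeded'):
#           print('failed to drop %s: %s'
#                 % (res['path'], res.get('message')))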
def __call__(
        path=None,
        *,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):
    # all this command does is to map legacy calls to their replacement
    # with drop()
    import warnings
    warnings.warn(
        "The `uninstall` command is deprecated and will be removed in "
        "a future release. "
        "Use the `drop` command for safer operation instead.",
        DeprecationWarning)

    reckless = None
    if not check:
        # the old uninstall/drop combo had no checks beyond git-annex
        # key copy redundancy
        reckless = 'kill'
    paths_by_ds = None
    if (reckless == 'kill' and not recursive) or if_dirty != 'ignore':
        refds = require_dataset(dataset, check_installed=True,
                                purpose='uninstall')
        # same path resolution that drop will do
        paths_by_ds, errors = get_paths_by_ds(
            refds, dataset, ensure_list(path), subdsroot_mode='sub')
    if reckless == 'kill' and not recursive:
        # drop requires recursive with kill
        # check the subdatasets to see if it is safe to enable it
        if all(not len(
                Dataset(d).subdatasets(
                    state='absent',
                    result_xfm='paths',
                    return_type='list',
                    result_renderer='disabled'))
               for d in paths_by_ds.keys()):
            # no dataset has any subdatasets, this is fine to set
            recursive = True
    # it has never made sense, but for "compatibility" reasons, and to keep
    # the "old" implementation slower, even if it uses the new implementation
    if if_dirty != 'ignore':
        for d in paths_by_ds.keys():
            handle_dirty_dataset(Dataset(d), mode=if_dirty)

    from datalad.api import drop
    lgr.debug(
        "Calling "
        "drop(dataset=%r, path=%r, recursive=%r, what='all', reckless=%r)",
        dataset, path, recursive, reckless)
    for res in drop(
            path=path,
            dataset=dataset,
            recursive=recursive,
            what='all',
            reckless=reckless,
            return_type='generator',
            result_renderer='disabled',
            # we need to delegate the decision making to this uninstall shim
            on_failure='ignore'):
        if res['status'] == 'error':
            msg, *rest = res["message"]
            if isinstance(msg, str) and "--reckless availability" in msg:
                # Avoid confusing datalad-uninstall callers with the new
                # drop parametrization while uninstall still exists.
                msg = msg.replace("--reckless availability", "--nocheck")
                res["message"] = (msg, *rest)
        yield res
    return
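# Given the mapping implemented by the shim above (check=False becomes
# reckless='kill', everything is dropped via what='all'), the following two
# command lines are intended to behave equivalently (sketch; the subdataset
# path is a placeholder):
#
#   datalad uninstall --nocheck --recursive some/subds
#   datalad drop --what all --reckless kill --recursive some/subds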
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `uninstall`: requires at least a "
            "path or dataset")

    to_uninstall = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            action='uninstall',
            # justification for status:
            # content need not be uninstalled where there is none
            unavailable_path_status='notneeded',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # upfront sanity and compliance checks
        # check that we have no top-level datasets and no files to process
        if ap.get('type') == 'dataset' and \
                not ap.get('state', None) == 'absent' and \
                path_is_under([ap['path']]):  # wants a sequence!
            ap.update(
                status='error',
                message="refusing to uninstall current or parent directory")
            yield ap
            continue
        if not ap.get('type', None) == 'dataset':
            ap.update(
                status='impossible',
                message="can only uninstall datasets (consider the `drop` "
                        "command)")
            yield ap
            continue
        # we only have datasets from here on
        if not ap.get('parentds', None):
            ap.update(
                status='error',
                message="will not uninstall top-level dataset "
                        "(consider `remove` command)")
            yield ap
            continue
        if not ap['path'] == refds_path:
            ap['process_content'] = True
        to_uninstall.append(ap)

    # iterate over all datasets, starting at the bottom
    # to deinit contained submodules first
    for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
        if ap.get('state', None) == 'absent':
            # already gone
            continue
        ds = Dataset(ap['path'])
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # we confirmed the super dataset presence above
        for r in _uninstall_dataset(ds, check=check, has_super=True,
                                    **res_kwargs):
            yield r
def __call__(
        dataset,
        guess_native_type=False,
        recursive=False,
        recursion_limit=None,
        save=True,
        if_dirty='save-before'):
    """
    Returns
    -------
    List
        Any datasets where (updated) aggregated meta data was saved.
    """
    ds = require_dataset(
        dataset, check_installed=True, purpose='meta data aggregation')

    modified_ds = []
    if ds.id is None:
        lgr.warning('%s has no configured ID, skipping.', dataset)
        return modified_ds

    # make sure we get to an expected state
    handle_dirty_dataset(ds, if_dirty)

    # if you want to modify the behavior of get_subdataset() make sure
    # there is a way to return the subdatasets DEPTH FIRST!
    ds_meta = {}
    for subds in ds.subdatasets(
            fulfilled=True,
            recursive=recursive,
            recursion_limit=recursion_limit,
            bottomup=True,
            result_xfm='datasets'):
        subds_relpath = relpath(subds.path, start=ds.path)
        if subds.id is None:
            # nothing to worry about, any meta data from below this will be
            # injected upstairs
            lgr.debug('skipping non-dataset at %s', subds.path)
            continue
        else:
            lgr.info('aggregating meta data for %s', subds)
        metapath = opj(subds.path, metadata_basepath)
        handle_dirty_dataset(subds, if_dirty)
        #
        # Phase 1: aggregate the within-dataset meta data, and store
        #          within the dataset
        #
        # pull out meta data from subds only (no subdatasets)
        _within_metadata_store(
            subds,
            guess_native_type,
            metapath)
        #
        # Phase 2: store everything that is in the lookup and belongs into
        #          this dataset
        #
        _dump_submeta(subds, ds_meta, subds_relpath, save, modified_ds)
        # save state of modified dataset, all we modified has been staged
        # already
        # we need to save before extracting the full metadata for upstairs
        # consumption to get the versions right
        modified_ds = _save_helper(subds, save, modified_ds)
        #
        # Phase 3: obtain all aggregated meta data from this dataset, and
        #          keep in lookup to escalate it upstairs
        #
        ds_meta[subds_relpath] = get_metadata(
            subds,
            guess_type=False,
            ignore_subdatasets=False,
            ignore_cache=False)

    lgr.info('aggregating meta data for %s', ds)
    # pull out meta data from parent only (no subdatasets)
    _within_metadata_store(
        ds,
        guess_native_type,
        opj(ds.path, metadata_basepath))
    # and lastly the subdatasets of the parent
    _dump_submeta(ds, ds_meta, '', save, modified_ds)
    # everything should be stored somewhere by now
    assert not len(ds_meta)
    # save the parent
    modified_ds = _save_helper(ds, save, modified_ds)
    # deliver the list promised by the docstring
    return modified_ds
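# `_save_helper` is used above but not shown. A minimal sketch of what it
# presumably does, given how its return value is threaded through the code;
# this exact body is an assumption, not the actual implementation:
def _save_helper(ds, save, modified_ds):
    if save and ds.repo.dirty:
        # anything to be saved was staged by the caller already
        Save.__call__(message='[DATALAD] aggregated meta data',
                      dataset=ds, auto_add_changes=False, recursive=False)
        # record the dataset as modified for the caller's report
        if ds not in modified_ds:
            modified_ds.append(ds)
    return modified_ds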
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='uninstall', logger=lgr, refds=refds_path)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `uninstall`: requires at least a "
            "path or dataset")
    to_uninstall = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            action='uninstall',
            # justification for status:
            # content need not be uninstalled where there is none
            unavailable_path_status='notneeded',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        # upfront sanity and compliance checks
        # check that we have no top-level datasets and no files to process
        if ap.get('type') == 'dataset' and \
                not ap.get('state', None) == 'absent' and \
                path_is_under([ap['path']]):  # wants a sequence!
            ap.update(
                status='error',
                message="refusing to uninstall current or parent directory")
            yield ap
            continue
        if not ap.get('type', None) == 'dataset':
            ap.update(
                status='impossible',
                message="can only uninstall datasets (consider the `drop` "
                        "command)")
            yield ap
            continue
        # we only have datasets from here on
        if not ap.get('parentds', None):
            # this could be a side-effect of the specific call semantics.
            # As stated in #1714, we are not really interested in whether
            # a superdataset was obvious in the call, but only whether there
            # is a superdataset at all. So let's look for one, and only barf
            # when there really isn't
            parentds = Dataset(ap['path']).get_superdataset(
                datalad_only=False,
                topmost=False,
                # unless it is properly registered we have no way of
                # reinstalling it
                registered_only=True)
            if parentds is None:
                ap.update(
                    status='error',
                    message="will not uninstall top-level dataset "
                            "(consider `remove` command)")
                yield ap
                continue
            ap['parentds'] = parentds.path
        if not ap['path'] == refds_path:
            ap['process_content'] = True
        to_uninstall.append(ap)
    # iterate over all datasets, starting at the bottom
    # to deinit contained submodules first
    for ap in sorted(to_uninstall, key=lambda x: x['path'], reverse=True):
        if ap.get('state', None) == 'absent':
            # already gone
            continue
        ds = Dataset(ap['path'])
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # we confirmed the super dataset presence above
        for r in _uninstall_dataset(ds, check=check, has_super=True,
                                    **res_kwargs):
            yield r
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        check=True,
        if_dirty='save-before'):
    if not dataset and not path:
        raise InsufficientArgumentsError(
            "insufficient information for `drop`: requires at least a path "
            "or dataset")
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path
    to_drop = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='drop',
            # justification for status:
            # content need not be dropped where there is none
            unavailable_path_status='notneeded',
            nondataset_path_status='error',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            ap['process_content'] = True
        if ap.get('registered_subds', False) and \
                ap.get('state', None) == 'absent':
            # nothing to drop in an absent subdataset, don't be annoying
            # and skip silently
            continue
        to_drop.append(ap)

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_drop,
            refds_path=refds_path)
    assert(not completed)

    # iterate over all datasets, order doesn't matter
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # ignore submodule entries
        content = [ap['path'] for ap in content_by_ds[ds_path]
                   if ap.get('type', None) != 'dataset' or
                   ap['path'] == ds.path]
        if not content:
            continue
        for r in _drop_files(ds, content, check=check, **res_kwargs):
            yield r
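# For orientation: `annotated2content_by_ds` (as used above) regroups the
# annotated-path records per containing dataset. A sketch of its return
# value, inferred from how the tuple is consumed here (the precise semantics
# of the last three elements are an assumption):
#
#   content_by_ds      dict: dataset path -> list of annotated-path records
#   ds_props           dict: dataset path -> annotated record of that dataset
#   completed          records that need no further processing
#   nondataset_paths   paths that could not be assigned to any dataset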
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):
    refds = require_dataset(dataset, check_installed=True,
                            purpose='uninstall')
    res_kwargs = dict(action='uninstall', logger=lgr, refds=refds.path)
    if not path:
        # if no path is given, ie. refds is supposed to be uninstalled
        # check if refds is a subdataset itself, if not die
        # we only need to test that for the refds, everything else
        # will be guaranteed to be a subdataset
        parentds = refds.get_superdataset(
            datalad_only=False,
            topmost=False,
            # unless it is properly registered we have no way of
            # reinstalling it
            registered_only=True)
        if parentds is None:
            yield dict(
                res_kwargs,
                path=refds.path,
                type='dataset',
                status='error',
                message="will not uninstall top-level dataset "
                        "(consider `remove` command)",
            )
            return

    saw_subds = False
    for ds in itertools.chain(Subdatasets.__call__(
            # it is critical to pass the dataset arg as-is
            # to not invalidate the path argument semantics
            # in subdatasets()
            dataset=dataset,
            path=path,
            fulfilled=True,
            # makes no sense to ignore subdatasets further down
            recursive=True,
            # important to start at the bottom for proper deinit
            bottomup=True,
            # doesn't make sense for uninstall
            #recursion_limit=recursion_limit,
            return_type='generator',
            result_renderer='disabled',
            result_xfm='datasets') if path or recursive else [],
            [refds] if not path else []):
        if ds != refds:
            saw_subds = True
        # TODO generator
        # this should yield what it did
        handle_dirty_dataset(ds, mode=if_dirty)
        # we confirmed the super dataset presence above
        for r in _uninstall_dataset(ds, check=check, has_super=True,
                                    **res_kwargs):
            yield r
    # there is nothing to save at the end
    if path and not saw_subds:
        lgr.warning(
            'path constraints did not match an installed subdataset: %s',
            path)
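# Why bottomup=True matters above: a subdataset must be deinitialized before
# the superdataset that contains it. Illustrative sketch for a hierarchy
# super/sub/subsub (paths are placeholders):
#
#   for ds in Subdatasets.__call__(dataset='super', fulfilled=True,
#                                  recursive=True, bottomup=True,
#                                  return_type='generator',
#                                  result_xfm='datasets'):
#       print(ds.path)
#   # prints .../super/sub/subsub before .../super/sub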
def _check_all_clean(ds, state):
    assert state is not None
    for mode in _dirty_modes:
        # nothing wrong, nothing saved
        handle_dirty_dataset(ds, mode)
        assert_equal(state, ds.repo.get_hexsha())
def __call__(
        path=None,
        dataset=None,
        remove_data=True,
        remove_handles=False,
        recursive=False,
        remove_history=False,
        check=True,
        kill=False,
        if_dirty='save-before'):

    # upfront check prior to any resolution attempt to avoid disaster
    if path is None and dataset is None:
        raise InsufficientArgumentsError(
            "insufficient information for uninstallation (needs at "
            "least a dataset or a path. To uninstall an entire dataset "
            "it needs to be given explicitly.")

    if remove_history and not remove_handles:
        raise ValueError(
            "`remove_history` flag requires the `remove_handles` flag")

    if not remove_data and not remove_handles:
        raise ValueError(
            "instructed to neither drop data, nor remove handles: cannot "
            "perform")

    path, dataset_path = get_normalized_path_arguments(
        path, dataset, default=curdir)

    results = []

    if kill:
        lgr.warning("Force-removing %d paths", len(path))
        for p in path:
            rmtree(p)
            results.append(p)
        return results

    ds = require_dataset(
        dataset, check_installed=True, purpose='uninstall')
    # make sure we get to an expected state
    handle_dirty_dataset(ds, if_dirty)

    # sort paths into the respective datasets that contain them
    # considering 1st-level subdatasets at most
    # NOTE: little dance with two dicts is necessary, because ATM our
    # Datasets are not hashable enough for PY3
    whocares_paths = {}
    whocares_ds = {}
    pwd = getpwd()
    for p in path:
        if remove_handles:
            # behave like `rm -r` and refuse to remove where we are
            rpath = relpath(p, start=pwd)
            if rpath == os.curdir \
                    or rpath == os.pardir \
                    or set(psplit(rpath)) == {os.pardir}:
                raise ValueError(
                    "refusing to remove current or parent directory")
        containerds = ds.get_containing_subdataset(p, recursion_limit=1)
        if not recursive and containerds.path != ds.path:
            raise ValueError(
                "will not uninstall content in subdatasets without the "
                "recursive flag")
        ps = whocares_paths.get(containerds.path, [])
        ps.append(p)
        whocares_paths[containerds.path] = ps
        whocares_ds[containerds.path] = containerds

    ds_gonealready = False
    if ds.path in whocares_paths:
        # start with the content of this dataset, as any somewhat
        # total recursive removal here would have most impact
        lgr.debug("Uninstall content in {}".format(ds))
        res, ds_gonealready = _uninstall(
            whocares_ds[ds.path],
            whocares_paths[ds.path],
            check=check,
            remove_history=remove_history,
            remove_data=remove_data,
            remove_handles=remove_handles,
            recursive=recursive)
        results.extend(res)

    if ds_gonealready:
        rmtree(ds.path)
        # the underlying repo is gone, the assert makes sure that the Dataset
        # instance becomes aware of that
        assert(not ds.is_installed())
        return results

    # otherwise deal with any other subdataset
    for subdspath in whocares_paths:
        subds = whocares_ds[subdspath]
        subdsrelpath = relpath(subdspath, start=ds.path)
        if subds == ds:
            continue
        res, subds_gone = _uninstall(
            subds,
            whocares_paths[subdspath],
            check=check,
            remove_history=remove_history,
            remove_data=remove_data,
            remove_handles=remove_handles,
            recursive=recursive)
        results.extend(res)

        if subds_gone:
            # clean divorce, if we lost the subds in the process
            # find the submodule that matches the path
            # regular access goes by name, but we cannot trust
            # our own consistency, yet
            submodule = [sm for sm in ds.repo.repo.submodules
                         if sm.path == subdsrelpath][0]
            submodule.remove()
        elif remove_handles:
            # we could have removed handles -> save
            Save.__call__(
                message='[DATALAD] uninstalled content',
                dataset=subds,
                auto_add_changes=False,
                recursive=False)
            # add this change to the parent, but don't save, will do in
            # one go below
            ds.repo.add(subdsrelpath, git=True)

    if remove_handles:
        # something of the original dataset is left at this point
        # and all subdatasets have been saved already
        # -> save changes
        Save.__call__(
            message='[DATALAD] uninstalled content',
            dataset=ds,
            auto_add_changes=False,
            recursive=False)

    return results
def __call__(
        dataset,
        guess_native_type=False,
        recursive=False,
        recursion_limit=None,
        save=True,
        if_dirty='save-before'):
    ds = require_dataset(
        dataset, check_installed=True, purpose='meta data aggregation')

    modified_ds = []
    if ds.id is None:
        lgr.warning('%s has no configured ID, skipping.', dataset)
        return modified_ds

    # make sure we get to an expected state
    handle_dirty_dataset(ds, if_dirty)

    # if you want to modify the behavior of get_subdataset() make sure
    # there is a way to return the subdatasets DEPTH FIRST!
    ds_meta = {}
    for subds_path in ds.get_subdatasets(
            fulfilled=True,
            absolute=False,
            recursive=recursive,
            recursion_limit=recursion_limit):
        subds = Dataset(opj(ds.path, subds_path))
        if subds.id is None:
            # nothing to worry about, any meta data from below this will be
            # injected upstairs
            lgr.debug('skipping non-dataset at %s', subds.path)
            continue
        else:
            lgr.info('aggregating meta data for %s', subds)
        metapath = opj(subds.path, metadata_basepath)
        handle_dirty_dataset(subds, if_dirty)
        #
        # Phase 1: aggregate the within-dataset meta data, and store
        #          within the dataset
        #
        # pull out meta data from subds only (no subdatasets)
        _within_metadata_store(
            subds,
            guess_native_type,
            metapath)
        #
        # Phase 2: store everything that is in the lookup and belongs into
        #          this dataset
        #
        _dump_submeta(subds, ds_meta, subds_path, save, modified_ds)
        # save state of modified dataset, all we modified has been staged
        # already
        # we need to save before extracting the full metadata for upstairs
        # consumption to get the versions right
        modified_ds = _save_helper(subds, save, modified_ds)
        #
        # Phase 3: obtain all aggregated meta data from this dataset, and
        #          keep in lookup to escalate it upstairs
        #
        ds_meta[subds_path] = get_metadata(
            subds,
            guess_type=False,
            ignore_subdatasets=False,
            ignore_cache=False)

    lgr.info('aggregating meta data for %s', ds)
    # pull out meta data from parent only (no subdatasets)
    _within_metadata_store(
        ds,
        guess_native_type,
        opj(ds.path, metadata_basepath))
    # and lastly the subdatasets of the parent
    _dump_submeta(ds, ds_meta, '', save, modified_ds)
    # everything should be stored somewhere by now
    assert not len(ds_meta)
    # save the parent
    modified_ds = _save_helper(ds, save, modified_ds)
    # report which datasets were modified
    return modified_ds
def __call__(
        path=None,
        source=None,
        dataset=None,
        to_git=False,
        save=True,
        recursive=False,
        recursion_limit=None,
        if_dirty='ignore',
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):

    # parameter constraints:
    if not path and not source:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path "
            "or a source.")

    # When called from cmdline `path` and `source` will be a list even if
    # there is only one item.
    # Make sure we deal with the same when called via python API:

    # always yields list; empty if None
    path = assure_list(path)
    source = assure_list(source)

    # TODO: Q: are the list operations in the following 3 blocks (resolving
    #          paths, sources and datasets) guaranteed to be stable
    #          regarding order?

    # resolve path(s):
    # TODO: RF: resolve_path => datalad.utils => more general (repos =>
    #           normalize paths)
    resolved_paths = [resolve_path(p, dataset) for p in path]

    # must come after resolve_path()!!
    # resolve dataset:
    dataset = require_dataset(dataset, check_installed=True,
                              purpose='adding')
    handle_dirty_dataset(dataset, if_dirty)

    # resolve source(s):
    resolved_sources = []
    for s in source:
        if not is_datalad_compat_ri(s):
            raise ValueError("invalid source parameter: %s" % s)
        resolved_sources.append(_get_git_url_from_source(s))

    # find (sub-)datasets to add things to (and fail on invalid paths):
    if recursive:
        # 1. Find the (sub-)datasets containing the given path(s):
        # Note, that `get_containing_subdataset` raises if `p` is
        # outside `dataset`, but it returns `dataset`, if `p` is inside
        # a subdataset not included by `recursion_limit`. In the latter
        # case, the git calls will fail instead.
        # We could check for this right here and fail early, but this
        # would lead to the need to discover the entire hierarchy no
        # matter if actually required.
        resolved_datasets = [dataset.get_containing_subdataset(
            p, recursion_limit=recursion_limit) for p in resolved_paths]

        # 2. Find implicit subdatasets to call add on:
        # If there are directories in resolved_paths (Note,
        # that this includes '.' and '..'), check for subdatasets
        # beneath them. These should be called recursively with '.'.
        # Therefore add the subdatasets to resolved_datasets and
        # corresponding '.' to resolved_paths, in order to generate the
        # correct call.
        for p in resolved_paths:
            if isdir(p):
                for subds_path in \
                        dataset.get_subdatasets(
                            absolute=True, recursive=True,
                            recursion_limit=recursion_limit):
                    if subds_path.startswith(_with_sep(p)):
                        resolved_datasets.append(Dataset(subds_path))
                        resolved_paths.append(curdir)

    else:
        # if not recursive, try to add everything to dataset itself:
        resolved_datasets = [dataset for i in range(len(resolved_paths))]

    # we need a resolved dataset per path:
    assert len(resolved_paths) == len(resolved_datasets)

    # sort parameters for actual git/git-annex calls:
    # (dataset, path, source)
    from six.moves import zip_longest

    param_tuples = list(zip_longest(resolved_datasets, resolved_paths,
                                    resolved_sources))
    # possible None-datasets in `param_tuples` were filled in by zip_longest
    # and need to be replaced by `dataset`:
    param_tuples = [(d if d is not None else dataset, p, s)
                    for d, p, s in param_tuples]

    calls = {d.path: {  # list of paths to 'git-add':
                        'g_add': [],
                        # list of paths to 'git-annex-add':
                        'a_add': [],
                        # list of sources to 'git-annex-addurl':
                        'addurl_s': [],
                        # list of (path, source) to
                        # 'git-annex-addurl --file':
                        'addurl_f': []
                        }
             for d in [i for i, p, s in param_tuples]}

    for ds, p, s in param_tuples:
        # it should not happen that `path` as well as `source` are None:
        assert p or s

        if not s:
            # we have a path only
            # Do not try to add to annex whenever there is no annex
            if to_git or not isinstance(ds.repo, AnnexRepo):
                calls[ds.path]['g_add'].append(p)
            else:
                calls[ds.path]['a_add'].append(p)
        elif not p:
            # we have a source only
            if to_git:
                raise NotImplementedError("Can't add a remote source "
                                          "directly to git.")
            calls[ds.path]['addurl_s'].append(s)
        else:
            # we have a path and a source
            if to_git:
                raise NotImplementedError("Can't add a remote source "
                                          "directly to git.")
            calls[ds.path]['addurl_f'].append((p, s))

    # now do the actual add operations:
    # TODO: implement git/git-annex/git-annex-add options

    datasets_return_values = defaultdict(list)
    for dspath in calls:
        ds = Dataset(dspath)
        return_values = datasets_return_values[dspath]
        lgr.info("Processing dataset %s ...", ds)

        # check every (sub-)dataset for annex once, since we can't add or
        # addurl anything, if there is no annex:
        # TODO: Q: Alternatively, just call git-annex-init if there's no
        #          annex yet and we have an annex-add/annex-addurl request?
        _is_annex = isinstance(ds.repo, AnnexRepo)

        if calls[ds.path]['g_add']:
            lgr.debug("Adding %s to git", calls[dspath]['g_add'])
            added = ds.repo.add(calls[dspath]['g_add'], git=True,
                                git_options=git_opts)
            return_values.extend(added)
        if calls[ds.path]['a_add']:
            if _is_annex:
                lgr.debug("Adding %s to annex", calls[dspath]['a_add'])
                return_values.extend(
                    ds.repo.add(calls[dspath]['a_add'],
                                git=False,
                                jobs=jobs,
                                git_options=git_opts,
                                annex_options=annex_opts,
                                options=annex_add_opts))
            else:
                lgr.debug("{0} is no annex. Skip 'annex-add' for "
                          "files {1}".format(ds, calls[dspath]['a_add']))
                return_values.extend(
                    [{'file': f,
                      'success': False,
                      'note': "no annex at %s" % ds.path}
                     for f in calls[dspath]['a_add']])

        # TODO: AnnexRepo.add_urls' return value doesn't contain the created
        #       file name but the url
        if calls[ds.path]['addurl_s']:
            if _is_annex:
                lgr.debug("Adding urls %s to annex",
                          calls[dspath]['addurl_s'])
                return_values.extend(
                    ds.repo.add_urls(calls[ds.path]['addurl_s'],
                                     options=annex_add_opts,
                                     # TODO: extra parameter for addurl?
                                     git_options=git_opts,
                                     annex_options=annex_opts,
                                     jobs=jobs))
            else:
                lgr.debug("{0} is no annex. Skip 'annex-addurl' for "
                          "files {1}".format(ds, calls[dspath]['addurl_s']))
                return_values.extend(
                    [{'file': f,
                      'success': False,
                      'note': "no annex at %s" % ds.path}
                     for f in calls[dspath]['addurl_s']])

        if calls[ds.path]['addurl_f']:
            if _is_annex:
                for f, u in calls[ds.path]['addurl_f']:
                    lgr.debug("Adding urls %s to files in annex",
                              calls[dspath]['addurl_f'])
                    return_values.append(
                        ds.repo.add_url_to_file(
                            f, u,
                            options=annex_add_opts,  # TODO: see above
                            git_options=git_opts,
                            annex_options=annex_opts,
                            batch=True))
            else:
                lgr.debug("{0} is no annex. Skip 'annex-addurl' for "
                          "files {1}".format(ds, calls[dspath]['addurl_f']))
                return_values.extend(
                    [{'file': f,
                      'success': False,
                      'note': "no annex at %s" % ds.path}
                     for f in calls[dspath]['addurl_f']])
        return_values = None  # to avoid mis-use

    # XXX or we could return entire datasets_return_values, could be useful
    # that way.  But then should be unified with the rest of commands, e.g.
    # get etc
    return_values_flat = []
    for dspath, return_values in datasets_return_values.items():
        if save and len(return_values):
            # we got something added -> save
            # everything we care about at this point should be staged already
            Save.__call__(
                message='[DATALAD] added content',
                # the original passed the leftover loop variable `ds`, which
                # at this point is whatever dataset happened to be processed
                # last; save the dataset the records actually belong to
                dataset=Dataset(dspath),
                auto_add_changes=False,
                recursive=False)
        # TODO: yoh feels that this is some common logic we already have
        # somewhere
        dsrelpath = relpath(dspath, dataset.path)
        if dsrelpath != curdir:
            # we need to adjust the 'file' entry in each record
            for return_value in return_values:
                if 'file' in return_value:
                    return_value['file'] = opj(dsrelpath,
                                               return_value['file'])
                return_values_flat.append(return_value)
        else:
            return_values_flat.extend(return_values)

    return return_values_flat
def __call__(
        path=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=False,
        save=True,
        annex_version=None,
        annex_backend='MD5E',
        native_metadata_type=None,
        if_dirty='save-before',
        shared_access=None,
        git_opts=None,
        annex_opts=None,
        annex_init_opts=None):

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if git_opts:
        lgr.warning(
            "`git_opts` argument is presently ignored, please complain!")
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")
        if annex_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex and declaring no "
                             "annex repo.")
        if annex_init_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex init and declaring no "
                             "annex repo.")

    if not isinstance(force, bool):
        raise ValueError(
            "force should be bool, got %r. Did you mean to provide a "
            "'path'?" % force)

    # straight from input arg, no messing around before this
    if path is None:
        if dataset is None:
            # nothing given explicitly, assume create fresh right here
            path = getpwd()
        else:
            # no path, but dataset -> create that dataset
            path = dataset.path
    else:
        # resolve the path against a potential dataset
        path = resolve_path(path, ds=dataset)

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    if git_opts is None:
        git_opts = {}
    if shared_access:
        # configure `git --shared` value
        git_opts['shared'] = shared_access

    # check for sane subdataset path
    real_targetpath = with_pathsep(realpath(path))  # realpath OK
    if dataset is not None:
        # make sure we get to an expected state
        if dataset.is_installed():
            handle_dirty_dataset(dataset, if_dirty)
        if not real_targetpath.startswith(  # realpath OK
                with_pathsep(realpath(dataset.path))):  # realpath OK
            raise ValueError("path {} outside {}".format(path, dataset))

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if dataset is not None and dataset.path == path \
        else Dataset(path)

    # don't create in non-empty directory without `force`:
    if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        raise ValueError("Cannot create dataset in directory %s "
                         "(not empty). Use option 'force' in order to "
                         "ignore this and enforce creation." % tbds.path)

    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        AnnexRepo(tbds.path, url=None, create=True,
                  backend=annex_backend,
                  version=annex_version,
                  description=description,
                  git_opts=git_opts,
                  annex_opts=annex_opts,
                  annex_init_opts=annex_init_opts)

    if native_metadata_type is not None:
        if not isinstance(native_metadata_type, list):
            native_metadata_type = [native_metadata_type]
        for nt in native_metadata_type:
            tbds.config.add('datalad.metadata.nativetype', nt)

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds.config.unset(id_var, where='dataset')
    tbds.config.add(
        id_var,
        tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
        where='dataset')

    # save everything
    tbds.repo.add('.datalad', git=True)

    if save:
        Save.__call__(
            message='[DATALAD] new dataset',
            dataset=tbds,
            auto_add_changes=False,
            recursive=False)

    if dataset is not None and dataset.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        from datalad.distribution.utils import _install_subds_inplace
        subdsrelpath = relpath(realpath(tbds.path),
                               realpath(dataset.path))  # realpath OK
        _install_subds_inplace(ds=dataset, path=tbds.path,
                               relativepath=subdsrelpath)
        # this will have staged the changes in the superdataset already
        if save:
            Save.__call__(
                message='[DATALAD] added subdataset',
                dataset=dataset,
                auto_add_changes=False,
                recursive=False)

    return tbds
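# Usage sketch for the command above (paths are placeholders):
#
#   from datalad.api import create
#
#   # a fresh annex-backed dataset
#   ds = create(path='my/new/dataset')
#   # a plain-git dataset (no annex), registered as a subdataset of `ds`
#   subds = create(path='my/new/dataset/code', dataset=ds, no_annex=True)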