def _create_subds_from_tarball(tarball, targetdir):
    filename = op.basename(tarball)
    importds = Dataset(op.join(targetdir, "dicoms")).create(
        return_type='item-or-list',
        result_xfm='datasets',
        result_filter=EnsureKeyChoice('action', ('create',))
        & EnsureKeyChoice('status', ('ok', 'notneeded'))
    )
    _import_dicom_tarball(importds, tarball, filename)

    importds.config.add(var="datalad.metadata.nativetype",
                        value="dicom",
                        where="dataset")
    importds.config.add(var="datalad.metadata.aggregate-content-dicom",
                        value='false',
                        where="dataset")
    # TODO: file an issue: config.add can't convert False to 'false' on its own
    # (But vice versa while reading IIRC)
    importds.config.add(var="datalad.metadata.maxfieldsize",
                        value='10000000',
                        where="dataset")
    importds.save(op.join(".datalad", "config"),
                  message="[HIRNI] initial config for DICOM metadata")
    return importds
def test_result_filter():
    # ensure baseline without filtering
    assert_equal([r['somekey'] for r in TestUtils().__call__(4)],
                 [0, 1, 2, 3])
    # test two functionally equivalent ways to filter results
    # 1. Constraint-based -- filter by exception
    #    we have a full set of AND and OR operators for this
    # 2. custom filter function -- filter by boolean return value
    for filt in (EnsureKeyChoice('somekey', (0, 2)),
                 lambda x: x['somekey'] in (0, 2)):
        assert_equal([r['somekey']
                      for r in TestUtils().__call__(4, result_filter=filt)],
                     [0, 2])
        # constraint returns full dict
        assert_dict_equal(
            TestUtils().__call__(4, result_filter=filt)[-1],
            {'action': 'off', 'path': 'some', 'status': 'ok', 'somekey': 2})

    # test more sophisticated filters that actually get to see the
    # API call's kwargs
    def greatfilter(res, **kwargs):
        assert_equal(kwargs.get('dataset', 'bob'), 'awesome')
        return True
    TestUtils().__call__(4, dataset='awesome', result_filter=greatfilter)

    def sadfilter(res, **kwargs):
        assert_equal(kwargs.get('dataset', 'bob'), None)
        return True
    TestUtils().__call__(4, result_filter=sadfilter)
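# A minimal usage sketch (not part of the test above) contrasting the two
# equivalent result_filter forms it exercises. The import location of
# EnsureKeyChoice is assumed to be datalad.support.constraints; the key names
# mirror the result records used throughout these snippets.
from datalad.support.constraints import EnsureKeyChoice

# constraint-based filter: constraints can be combined with `&` (AND) and
# `|` (OR); a record is kept if the constraint accepts it, otherwise it is
# dropped (filtering by exception)
keep_ok_datasets = EnsureKeyChoice('status', ('ok', 'notneeded')) \
    & EnsureKeyChoice('type', ('dataset',))

# functionally equivalent callable filter: returns a boolean and, as shown by
# greatfilter/sadfilter above, also receives the originating API call's
# keyword arguments
def keep_ok_datasets_fn(res, **kwargs):
    return res.get('status') in ('ok', 'notneeded') \
        and res.get('type') == 'dataset'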
def _get_result_filter(cls, args):
    from datalad import cfg
    result_filter = None
    if args.common_report_status or 'datalad.runtime.report-status' in cfg:
        report_status = args.common_report_status or \
                        cfg.obtain('datalad.runtime.report-status')
        if report_status == 'success':
            result_filter = EnsureKeyChoice('status', ('ok', 'notneeded'))
        elif report_status == 'failure':
            result_filter = EnsureKeyChoice('status',
                                            ('impossible', 'error'))
        else:
            result_filter = EnsureKeyChoice('status', (report_status,))
    if args.common_report_type:
        tfilt = EnsureKeyChoice('type', tuple(args.common_report_type))
        result_filter = result_filter & tfilt if result_filter else tfilt
    return result_filter
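# Hedged sketch of how a filter composed by _get_result_filter() acts on
# individual result records: a constraint either accepts the record (keep) or
# raises (drop). It assumes EnsureKeyChoice raises ValueError on a
# non-matching record, as DataLad constraints generally do; the records below
# are illustrative only (EnsureKeyChoice imported as in the sketch further
# above).
def _passes(result_filter, res):
    if result_filter is None:
        # no filtering requested -> keep everything
        return True
    try:
        result_filter(res)
        return True
    except ValueError:
        return False

failure_datasets = EnsureKeyChoice('status', ('impossible', 'error')) \
    & EnsureKeyChoice('type', ('dataset',))
assert _passes(failure_datasets,
               {'action': 'install', 'status': 'error', 'type': 'dataset'})
assert not _passes(failure_datasets,
                   {'action': 'install', 'status': 'ok', 'type': 'dataset'})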
class Clone(Interface):
    """Obtain a dataset (copy) from a URL or local directory

    The purpose of this command is to obtain a new clone (copy) of a dataset
    and place it into a not-yet-existing or empty directory. As such `clone`
    provides a strict subset of the functionality offered by `install`. Only
    a single dataset can be obtained, and immediate recursive installation of
    subdatasets is not supported. However, once a (super)dataset is installed
    via `clone`, any content, including subdatasets, can be obtained by a
    subsequent `get` command.

    Primary differences over a direct `git clone` call are 1) the automatic
    initialization of a dataset annex (pure Git repositories are equally
    supported); 2) automatic registration of the newly obtained dataset as a
    subdataset (submodule), if a parent dataset is specified; 3) support for
    additional resource identifiers (DataLad resource identifiers as used on
    datasets.datalad.org, and RIA store URLs as used for store.datalad.org -
    optionally in specific versions as identified by a branch or a tag; see
    examples); and 4) automatic configurable generation of alternative access
    URLs for common cases (such as appending '.git' to the URL in case
    accessing the base URL failed).

    || PYTHON >>By default, the command returns a single Dataset instance for
    an installed dataset, regardless of whether it was newly installed ('ok'
    result), or found already installed from the specified source ('notneeded'
    result).<< PYTHON ||

    .. seealso::

      :ref:`handbook:3-001`
        More information on Remote Indexed Archive (RIA) stores
    """
    # by default ignore everything but install results
    # i.e. no "add to super dataset"
    result_filter = EnsureKeyChoice('action', ('install',))
    # very frequently this command will yield exactly one installed dataset
    # spare people the pain of going through a list by default
    return_type = 'item-or-list'
    # as discussed in #1409 and #1470, we want to return dataset instances
    # matching what is actually available after command completion (and
    # None for any failed dataset installation)
    result_xfm = 'successdatasets-or-none'

    _examples_ = [
        dict(text="Install a dataset from Github into the current directory",
             code_py="clone("
             "source='https://github.com/datalad-datasets/longnow"
             "-podcasts.git')",
             code_cmd="datalad clone "
             "https://github.com/datalad-datasets/longnow-podcasts.git"),
        dict(text="Install a dataset into a specific directory",
             code_py="""\
             clone(source='https://github.com/datalad-datasets/longnow-podcasts.git',
                   path='myfavpodcasts')""",
             code_cmd="""\
             datalad clone https://github.com/datalad-datasets/longnow-podcasts.git \\
             myfavpodcasts"""),
        dict(text="Install a dataset as a subdataset into the current dataset",
             code_py="""\
             clone(dataset='.',
                   source='https://github.com/datalad-datasets/longnow-podcasts.git')""",
             code_cmd="datalad clone -d . "
                      "https://github.com/datalad-datasets/longnow-podcasts.git"),
        dict(text="Install the main superdataset from datasets.datalad.org",
             code_py="clone(source='///')",
             code_cmd="datalad clone ///"),
        dict(text="Install a dataset identified by a literal alias from store.datalad.org",
             code_py="clone(source='ria+http://store.datalad.org#~hcp-openaccess')",
             code_cmd="datalad clone ria+http://store.datalad.org#~hcp-openaccess"),
        dict(
            text="Install a dataset in a specific version as identified by a "
                 "branch or tag name from store.datalad.org",
            code_py="clone(source='ria+http://store.datalad.org#76b6ca66-36b1-11ea-a2e6-f0d5bf7b5561@myidentifier')",
            code_cmd="datalad clone ria+http://store.datalad.org#76b6ca66-36b1-11ea-a2e6-f0d5bf7b5561@myidentifier"),
        dict(
            text="Install a dataset with group-write access permissions",
            code_py="clone(source='http://example.com/dataset', reckless='shared-group')",
            code_cmd="datalad clone http://example.com/dataset --reckless shared-group"),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""(parent) dataset to clone into. If given, the newly cloned
            dataset is registered as a subdataset of the parent. Also, if given,
            relative paths are interpreted as being relative to the parent
            dataset, and not relative to the working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        source=Parameter(
            args=("source",),
            metavar='SOURCE',
            doc="""URL, DataLad resource identifier, local path or instance of
            dataset to be cloned""",
            constraints=EnsureStr() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            nargs="?",
            doc="""path to clone into. If no `path` is provided a destination
            path will be derived from a source URL similar to
            :command:`git clone`"""),
        description=location_description,
        reckless=reckless_opt,
    )

    @staticmethod
    @datasetmethod(name='clone')
    @eval_results
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=None):
        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = ds.path if ds else None

        # legacy compatibility
        if reckless is True:
            # so that we can forget about how things used to be
            reckless = 'auto'

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `save`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            # since this is a relative `path`, resolve it:
            # we are not going to reuse the decoded URL, as this is done for
            # all source candidates in clone_dataset(); we just use it to
            # determine a destination path here in order to perform a bunch
            # of additional checks that shall not pollute the helper function
            source_ = decode_source_spec(
                source, cfg=None if ds is None else ds.config)
            path = resolve_path(source_['default_destpath'], dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        result_props = dict(
            action='install',
            logger=lgr,
            refds=refds_path,
            source_url=source)

        try:
            # this will implicitly cause pathlib to run a bunch of checks
            # whether the present path makes any sense on the platform
            # we are running on -- we don't care if the path actually
            # exists at this point, but we want to abort early if the path
            # spec is determined to be useless
            path.exists()
        except OSError as e:
            yield get_status_dict(
                status='error',
                path=path,
                message=('cannot handle target path: %s', exc_str(e)),
                **result_props)
            return

        destination_dataset = Dataset(path)
        result_props['ds'] = destination_dataset

        if ds is not None and ds.pathobj not in path.parents:
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified "
                         "target dataset '%s'", path, ds),
                **result_props)
            return

        # perform the actual cloning operation
        yield from clone_dataset(
            [source],
            destination_dataset,
            reckless,
            description,
            result_props,
            cfg=None if ds is None else ds.config,
        )

        # TODO: handle any 'version' property and its verification using a
        # dedicated public helper

        if ds is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in ds.save(
                    path,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r
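# Hedged usage sketch for the class-level result handling defaults above:
# with result_filter on action='install', return_type='item-or-list' and
# result_xfm='successdatasets-or-none', a plain Python call returns a single
# Dataset instance (or None on failure) instead of a list of result records.
# The URL comes from _examples_ above; the destination path and the explicit
# keyword overrides are illustrative assumptions about the common
# eval_results keyword arguments.
from datalad.api import clone

ds = clone(source='https://github.com/datalad-datasets/longnow-podcasts.git')

# the defaults can be overridden per call, e.g. to inspect raw result records
for res in clone(
        source='https://github.com/datalad-datasets/longnow-podcasts.git',
        path='longnow-again',
        return_type='generator',
        result_xfm=None,
        result_filter=None):
    print(res.get('action'), res.get('status'), res.get('path'))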
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new :term:`dataset` at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing :term:`superdataset` (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly). It is recommended to provide a brief description to label
    the dataset's nature *and* location, e.g. "Michael's music on black
    laptop". This helps humans to identify data locations in distributed
    scenarios. By default an identifier comprised of user and machine name,
    plus path will be generated.

    This command only creates a new dataset, it does not add any content to
    it, even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via the [PY: `no_annex` PY][CMD:
    --no-annex CMD] flag. However, the result will not be a full dataset, and,
    consequently, not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init`, and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = EnsureKeyChoice('action', ('create',)) & \
                    EnsureKeyChoice('status', ('ok', 'notneeded'))

    _params_ = dict(
        path=Parameter(
            args=("path",),
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the current working directory. Either way the
            command will error if the target directory is not empty.
            Use `force` to create a dataset in a non-empty directory.""",
            nargs='?',
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='PATH',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given, a new subdataset will be created in it.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=("-f", "--force",),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        # TODO could move into cfg_annex plugin
        no_annex=Parameter(
            args=("--no-annex",),
            doc="""if set, a plain Git repository will be created without any
            annex""",
            action='store_true'),
        text_no_annex=Parameter(
            args=("--text-no-annex",),
            doc="""if set, all future text files will be added to Git, not
            annex. Achieved by adding an entry to the `.gitattributes` file.
            See http://git-annex.branchable.com/tips/largefiles/ and the
            `no_annex` DataLad plugin to establish even more detailed control
            over which files are placed under annex control.""",
            action='store_true'),
        save=nosave_opt,
        # TODO could move into cfg_annex plugin
        annex_version=Parameter(
            args=("--annex-version",),
            doc="""select a particular annex repository version. The list of
            supported versions depends on the available git-annex version.
            This should be left untouched, unless you know what you are
            doing""",
            constraints=EnsureDType(int) | EnsureNone()),
        # TODO could move into cfg_annex plugin
        annex_backend=Parameter(
            args=("--annex-backend",),
            constraints=EnsureStr() | EnsureNone(),
            # not listing choices here on purpose to avoid future bugs
            doc="""set default hashing backend used by the new dataset.
            For a list of supported backends see the git-annex documentation.
            The default is optimized for maximum compatibility of datasets
            across platforms (especially those with limited path lengths)"""),
        # TODO could move into cfg_metadata plugin
        native_metadata_type=Parameter(
            args=('--native-metadata-type',),
            metavar='LABEL',
            action='append',
            constraints=EnsureStr() | EnsureNone(),
            doc="""Metadata type label. Must match the name of the respective
            parser implementation in DataLad (e.g. "bids").[CMD:  This option
            can be given multiple times CMD]"""),
        # TODO could move into cfg_access/permissions plugin
        shared_access=shared_access_opt,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_init_opts=annex_init_opts,
    )

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(
            path=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            save=True,
            annex_version=None,
            annex_backend='MD5E',
            native_metadata_type=None,
            shared_access=None,
            git_opts=None,
            annex_opts=None,
            annex_init_opts=None,
            text_no_annex=None):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        if path and dataset:
            # Given a path and a dataset (path) not pointing to an installed
            # dataset
            if not dataset.is_installed():
                msg = "No installed dataset at %s found." % dataset.path
                dsroot = get_dataset_root(dataset.path)
                if dsroot:
                    msg += " If you meant to add to the %s dataset, use that path " \
                           "instead but remember that if dataset is provided, " \
                           "relative paths are relative to the top of the " \
                           "dataset." % dsroot
                raise ValueError(msg)

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r. Did you mean to provide a 'path'?"
                % force)

        annotated_paths = AnnotatePaths.__call__(
            # nothing given explicitly, assume create fresh right here
            path=path if path else getpwd() if dataset is None else None,
            dataset=dataset,
            recursive=False,
            action='create',
            # we need to know whether we have to check for potential
            # subdataset collision
            force_parentds_discovery=True,
            # it is absolutely OK to have something that does not exist
            unavailable_path_status='',
            unavailable_path_msg=None,
            # if we have a dataset given that actually exists, we want to
            # fail if the requested path is not in it
            nondataset_path_status='error'
            if isinstance(dataset, Dataset) and dataset.is_installed() else '',
            on_failure='ignore')
        path = None
        for r in annotated_paths:
            if r['status']:
                # this is dealt with already
                yield r
                continue
            if path is not None:
                raise ValueError(
                    "`create` can only handle single target path or dataset")
            path = r

        if len(annotated_paths) and path is None:
            # we got something, we complained already, done
            return

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # prep for yield
        path.update({'logger': lgr, 'type': 'dataset'})
        # just discard, we have a new story to tell
        path.pop('message', None)
        if 'parentds' in path:
            subs = Subdatasets.__call__(
                dataset=path['parentds'],
                # any known
                fulfilled=None,
                recursive=False,
                contains=path['path'],
                result_xfm='relpaths')
            if len(subs):
                path.update({
                    'status': 'error',
                    'message': ('collision with known subdataset %s/ in dataset %s',
                                subs[0], path['parentds'])})
                yield path
                return

        # TODO here we need a further test that if force=True, we need to look if
        # there is a superdataset (regardless of whether we want to create a
        # subdataset or not), and if that superdataset tracks anything within
        # this directory -- if so, we need to stop right here and whine, because
        # the result of creating a repo here will produce an undesired mess

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
            else Dataset(path['path'])

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            path.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield path
            return

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(
                tbds.path,
                url=None,
                create=True,
                git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                backend=annex_backend,
                version=annex_version,
                description=description,
                git_opts=git_opts,
                annex_opts=annex_opts,
                annex_init_opts=annex_init_opts)

            if text_no_annex:
                git_attributes_file = opj(tbds.path, '.gitattributes')
                with open(git_attributes_file, 'a') as f:
                    f.write('* annex.largefiles=(not(mimetype=text/*))\n')
                tbrepo.add([git_attributes_file], git=True)
                tbrepo.commit("Instructed annex to add text files to git",
                              _datalad_msg=True,
                              files=[git_attributes_file])

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')
        tbds.config.add(
            id_var,
            tbds.id if tbds.id is not None else uuid.uuid1().urn.split(':')[-1],
            where='dataset')

        # make sure that v6 annex repos never commit content under .datalad
        with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
            # TODO this will need adjusting, when annex'ed aggregate meta data
            # comes around
            gitattr.write(
                '# Text files (according to file --mime-type) are added '
                'directly to git.\n')
            gitattr.write(
                '# See http://git-annex.branchable.com/tips/largefiles/ '
                'for more info.\n')
            gitattr.write('** annex.largefiles=nothing\n')

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.add('.datalad', to_git=True, save=save,
                 message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if save and isinstance(dataset, Dataset) and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(
                    tbds.path,
                    save=True,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r

        path.update({'status': 'ok'})
        yield path

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if res.get('action', None) == 'create' and \
                res.get('status', None) == 'ok' and \
                res.get('type', None) == 'dataset':
            ui.message("Created dataset at {}.".format(res['path']))
        else:
            ui.message("Nothing was created")
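# Hedged illustration of the combined result filter above: only records with
# action='create' AND status in ('ok', 'notneeded') survive, so the 'add'
# records produced when registering the new subdataset in a parent do not
# surface, and together with result_xfm='datasets' and
# return_type='item-or-list' the caller gets back the created Dataset
# instance itself. The paths used here are hypothetical.
from datalad.api import create

superds = create(path='parent-ds')           # -> Dataset instance
# a relative `path` is interpreted against the provided `dataset`
subds = create(dataset=superds, path='sub')  # still a single Dataset instance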
class Clone(Interface):
    """Obtain a dataset copy from a URL or local source (path)

    The purpose of this command is to obtain a new clone (copy) of a dataset
    and place it into a not-yet-existing or empty directory. As such `clone`
    provides a strict subset of the functionality offered by `install`. Only
    a single dataset can be obtained, recursion is not supported. However,
    once installed, arbitrary dataset components can be obtained via a
    subsequent `get` command.

    Primary differences over a direct `git clone` call are 1) the automatic
    initialization of a dataset annex (pure Git repositories are equally
    supported); 2) automatic registration of the newly obtained dataset as a
    subdataset (submodule), if a parent dataset is specified; 3) support for
    datalad's resource identifiers and automatic generation of alternative
    access URLs for common cases (such as appending '.git' to the URL in case
    accessing the base URL failed); and 4) ability to take additional
    alternative source locations as an argument.
    """
    # by default ignore everything but install results
    # i.e. no "add to super dataset"
    result_filter = EnsureKeyChoice('action', ('install',))

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""(parent) dataset to clone into. If given, the newly cloned
            dataset is registered as a subdataset of the parent. Also, if given,
            relative paths are interpreted as being relative to the parent
            dataset, and not relative to the working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        source=Parameter(
            args=("source",),
            metavar='SOURCE',
            doc="""URL, DataLad resource identifier, local path or instance of
            dataset to be cloned""",
            constraints=EnsureStr() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            nargs="?",
            doc="""path to clone into. If no `path` is provided a destination
            path will be derived from a source URL similar to
            :command:`git clone`"""),
        description=location_description,
        reckless=reckless_opt,
        alt_sources=Parameter(
            args=('--alternative-sources',),
            dest='alt_sources',
            metavar='SOURCE',
            nargs='+',
            doc="""Alternative sources to be tried if a dataset cannot be
            obtained from the main `source`""",
            constraints=EnsureStr() | EnsureNone()),
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=git_opts,
        # git_clone_opts=git_clone_opts,
        # annex_opts=annex_opts,
        # annex_init_opts=annex_init_opts,
    )

    @staticmethod
    @datasetmethod(name='clone')
    @eval_results
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=False,
            alt_sources=None):
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=None,
        # git_clone_opts=None,
        # annex_opts=None,
        # annex_init_opts=None

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        dataset = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = dataset.path if dataset else None

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `add`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source_url = source
        source_ = _get_git_url_from_source(source)
        lgr.debug("Resolved clone source from '%s' to '%s'",
                  source, source_)
        source = source_

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        destination_dataset = Dataset(path)
        dest_path = path

        status_kwargs = dict(
            action='install',
            ds=destination_dataset,
            logger=lgr,
            refds=refds_path,
            source_url=source_url)

        # important test! based on this `rmtree` will happen below after a
        # failed clone
        if exists(dest_path) and listdir(dest_path):
            if destination_dataset.is_installed():
                # check if dest was cloned from the given source before
                # this is where we would have installed this from
                guessed_sources = _get_flexible_source_candidates(
                    source, dest_path)
                # this is where it was actually installed from
                track_name, track_url = _get_tracking_source(destination_dataset)
                if track_url in guessed_sources or \
                        get_local_file_url(track_url) in guessed_sources:
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destination_dataset,
                                 source),
                        **status_kwargs)
                    return
            # anything else is an error
            yield get_status_dict(
                status='error',
                message='target path already exists and not empty, refuse to '
                        'clone into target path',
                **status_kwargs)
            return

        if dataset is not None and \
                relpath(path, start=dataset.path).startswith(pardir):
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified "
                         "target dataset '%s'", path, dataset),
                **status_kwargs)
            return

        # generate candidate URLs from source argument to overcome a few
        # corner cases and hopefully be more robust than git clone
        candidate_sources = []
        # combine all given sources (incl. alternatives), maintain order
        for s in [source] + assure_list(alt_sources):
            candidate_sources.extend(_get_flexible_source_candidates(s))
        candidates_str = \
            " [%d other candidates]" % (len(candidate_sources) - 1) \
            if len(candidate_sources) > 1 \
            else ''
        lgr.info("Cloning %s%s into '%s'",
                 source, candidates_str, dest_path)
        dest_path_existed = exists(dest_path)
        # accumulate all error messages formatted per each url
        error_msgs = OrderedDict()
        for isource_, source_ in enumerate(candidate_sources):
            try:
                lgr.debug(
                    "Attempting to clone %s (%d out of %d candidates) to '%s'",
                    source_, isource_ + 1, len(candidate_sources), dest_path)
                GitRepo.clone(path=dest_path, url=source_, create=True)
                break  # do not bother with other sources if succeeded
            except GitCommandError as e:
                error_msgs[source_] = exc_str_ = exc_str(e)
                lgr.debug("Failed to clone from URL: %s (%s)",
                          source_, exc_str_)
                if exists(dest_path):
                    lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                              dest_path)
                    # We must not just rmtree since it might be curdir etc
                    # we should remove all files/directories under it
                    rmtree(dest_path, children_only=dest_path_existed)
                # Whenever progress reporting is enabled, as it is now,
                # we end up without e.stderr since it is "processed" out by
                # GitPython/our progress handler.
                e_stderr = e.stderr
                from datalad.support.gitrepo import GitPythonProgressBar
                if not e_stderr and GitPythonProgressBar._last_error_lines:
                    e_stderr = os.linesep.join(
                        GitPythonProgressBar._last_error_lines)
                if 'could not create work tree' in e_stderr.lower():
                    # this cannot be fixed by trying another URL
                    re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                        flags=re.MULTILINE | re.DOTALL)
                    yield get_status_dict(
                        status='error',
                        message=re_match.group(1) if re_match
                        else "stderr: " + e_stderr,
                        **status_kwargs)
                    return

        if not destination_dataset.is_installed():
            if len(error_msgs):
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were: %s"
                error_args = (error_msgs,)
            else:
                # yoh: Not sure if we ever get here but I felt that there could
                #      be a case when this might happen and original error would
                #      not be sufficient to troubleshoot what is going on.
                error_msg = "Awkward error -- we failed to clone properly. " \
                            "Although no errors were encountered, target " \
                            "dataset at %s seems to be not fully installed. " \
                            "The 'successful' source was: %s"
                error_args = (destination_dataset.path, source_)
            yield get_status_dict(
                status='error',
                message=(error_msg, error_args),
                **status_kwargs)
            return

        if dataset is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(
                    dest_path,
                    save=True,
                    ds2super=True,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r

        _handle_possible_annex_dataset(
            destination_dataset,
            reckless,
            description=description)

        # yield successful clone of the base dataset now, as any possible
        # subdataset clone down below will not alter the Git-state of the
        # parent
        yield get_status_dict(status='ok', **status_kwargs)
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new dataset at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing superdataset (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly via [PY: `dataset` PY][CMD: --dataset CMD]). It is recommended
    to provide a brief description to label the dataset's nature *and*
    location, e.g. "Michael's music on black laptop". This helps humans to
    identify data locations in distributed scenarios. By default an
    identifier comprised of user and machine name, plus path will be
    generated.

    This command only creates a new dataset, it does not add existing content
    to it, even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via the [PY: `no_annex` PY][CMD:
    --no-annex CMD] flag. However, the result will not be a full dataset, and,
    consequently, not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init` and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = \
        EnsureKeyChoice('action', ('create',)) & \
        EnsureKeyChoice('status', ('ok', 'notneeded'))

    _params_ = dict(
        path=Parameter(
            args=("path",),
            nargs='?',
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the current working directory. Either way the
            command will error if the target directory is not empty.
            Use `force` to create a dataset in a non-empty directory.""",
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        initopts=Parameter(
            args=("initopts",),
            metavar='INIT OPTIONS',
            nargs=REMAINDER,
            doc="""options to pass to :command:`git init`. [PY: Options can be
            given as a list of command line arguments or as a GitPython-style
            option dictionary PY][CMD: Any argument specified after the
            destination path of the repository will be passed to git-init
            as-is CMD]. Note that not all options will lead to viable results.
            For example '--bare' will not yield a repository where DataLad
            can adjust files in its worktree."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given, a new subdataset will be created in it.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=("-f", "--force",),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        no_annex=Parameter(
            args=("--no-annex",),
            doc="""if set, a plain Git repository will be created without any
            annex""",
            action='store_true'),
        # TODO seems to only cause a config flag to be set, this could be done
        # in a procedure
        fake_dates=Parameter(
            args=('--fake-dates',),
            action='store_true',
            doc="""Configure the repository to use fake dates. The date for a
            new commit will be set to one second later than the latest commit
            in the repository. This can be used to anonymize dates."""),
        cfg_proc=Parameter(
            args=("-c", "--cfg-proc"),
            metavar="PROC",
            action='append',
            doc="""Run cfg_PROC procedure(s) (can be specified multiple times)
            on the created dataset. Use
            [PY: `run_procedure(discover=True)` PY][CMD: run-procedure --discover CMD]
            to get a list of available procedures, such as cfg_text2git.
            """)
    )

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(
            path=None,
            initopts=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            fake_dates=False,
            cfg_proc=None):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # prep for yield
        res = dict(action='create', path=text_type(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        dataset, text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(check_path == p or check_path in p.parents
                   for p in pstatus):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        text_type(parentds_path),
                        [text_type(c) for c in conflict])})
                yield res
                return
            # another set of checks to see whether the target path is pointing
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'}
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        text_type(conflict[0]),
                        text_type(parentds_path))})
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                git_opts=initopts,
                fake_dates=fake_dates)
            # place a .noannex file to indicate annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {
                'type': 'file',
                'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates)
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(
                cfg.obtain('datalad.repo.backend'),
                persistent=True, commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'}
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles',
                 '({})'.format(cfg.obtain(
                     'datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(
                    set_attrs,
                    attrfile=op.join('.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get('annex.largefiles', None) == 'nothing':
                tbds.repo.set_gitattributes([
                    ('**/.git*', {'annex.largefiles': 'nothing'})])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'}

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            where='dataset',
            reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)
        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(path=tbds.path):
                yield r

        res.update({'status': 'ok'})
        yield res

        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r

    @staticmethod
    def custom_result_renderer(res, **kwargs):  # pragma: no cover
        from datalad.ui import ui
        if res.get('action', None) == 'create' and \
                res.get('status', None) == 'ok' and \
                res.get('type', None) == 'dataset':
            ui.message("Created dataset at {}.".format(res['path']))
        else:
            ui.message("Nothing was created")
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new dataset at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing superdataset (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly via [PY: `dataset` PY][CMD: --dataset CMD]). It is recommended
    to provide a brief description to label the dataset's nature *and*
    location, e.g. "Michael's music on black laptop". This helps humans to
    identify data locations in distributed scenarios. By default an
    identifier comprised of user and machine name, plus path will be
    generated.

    This command only creates a new dataset, it does not add existing content
    to it, even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via [PY: `annex=False` PY][CMD:
    --no-annex CMD]. However, the result will not be a full dataset, and,
    consequently, not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init` and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = \
        EnsureKeyChoice('action', ('create',)) & \
        EnsureKeyChoice('status', ('ok', 'notneeded'))

    _examples_ = [
        dict(text="Create a dataset 'mydataset' in the current directory",
             code_py="create(path='mydataset')",
             code_cmd="datalad create mydataset"),
        dict(text="Apply the text2git procedure upon creation of a dataset",
             code_py="create(path='mydataset', cfg_proc='text2git')",
             code_cmd="datalad create -c text2git mydataset"),
        dict(text="Create a subdataset in the root of an existing dataset",
             code_py="create(dataset='.', path='mysubdataset')",
             code_cmd="datalad create -d . mysubdataset"),
        dict(text="Create a dataset in an existing, non-empty directory",
             code_py="create(force=True)",
             code_cmd="datalad create --force"),
        dict(text="Create a plain Git repository",
             code_py="create(path='mydataset', annex=False)",
             code_cmd="datalad create --no-annex mydataset"),
    ]

    _params_ = dict(
        path=Parameter(
            args=("path",),
            nargs='?',
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the location specified by [PY: `dataset`
            PY][CMD: --dataset CMD] (if given) or the current working
            directory. Either way the command will error if the target
            directory is not empty. Use [PY: `force` PY][CMD: --force CMD] to
            create a dataset in a non-empty directory.""",
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        initopts=Parameter(
            args=("initopts",),
            metavar='INIT OPTIONS',
            nargs=REMAINDER,
            doc="""options to pass to :command:`git init`. [PY: Options can be
            given as a list of command line arguments or as a GitPython-style
            option dictionary PY][CMD: Any argument specified after the
            destination path of the repository will be passed to git-init
            as-is CMD]. Note that not all options will lead to viable results.
            For example '--bare' will not yield a repository where DataLad
            can adjust files in its working tree."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given along with `path`, a new subdataset will be
            created in it at the `path` provided to the create command. If
            a dataset is given but `path` is unspecified, a new dataset will
            be created at the location specified by this option.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=("-f", "--force",),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        annex=Parameter(
            args=("--no-annex",),
            dest='annex',
            doc="""if [CMD: set CMD][PY: disabled PY], a plain Git repository
            will be created without any annex""",
            action='store_false'),
        # TODO seems to only cause a config flag to be set, this could be done
        # in a procedure
        fake_dates=Parameter(
            args=('--fake-dates',),
            action='store_true',
            doc="""Configure the repository to use fake dates. The date for a
            new commit will be set to one second later than the latest commit
            in the repository. This can be used to anonymize dates."""),
        cfg_proc=Parameter(
            args=("-c", "--cfg-proc"),
            metavar="PROC",
            action='append',
            doc="""Run cfg_PROC procedure(s) (can be specified multiple times)
            on the created dataset. Use
            [PY: `run_procedure(discover=True)` PY][CMD: run-procedure --discover CMD]
            to get a list of available procedures, such as cfg_text2git.
            """)
    )

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(
            path=None,
            initopts=None,
            *,
            force=False,
            description=None,
            dataset=None,
            annex=True,
            fake_dates=False,
            cfg_proc=None):
        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
                isinstance(initopts, dict) and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = ensure_list(cfg_proc)

        # prep for yield
        res = dict(action='create', path=str(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='create a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset"
                    and any(check_path == p or check_path in p.parents
                            for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        str(parentds_path),
                        [str(c) for c in conflict])})
                yield res
                return
            if not force:
                # another set of checks to see whether the target path is
                # pointing into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'}
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [p for p in check_paths if p in subds_status]
                    res.update({
                        'status': 'error',
                        'message': (
                            'collision with %s (dataset) in dataset %s',
                            str(conflict[0]),
                            str(parentds_path))})
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`--force` option to ignore'})
            yield res
            return

        # Check if specified cfg_proc(s) can be discovered, storing
        # the results so they can be used when the time comes to run
        # the procedure. If a procedure cannot be found, raise an
        # error to prevent creating the dataset.
        cfg_proc_specs = []
        if cfg_proc:
            discovered_procs = tbds.run_procedure(
                discover=True,
                result_renderer='disabled',
                return_type='generator',
            )
            for cfg_proc_ in cfg_proc:
                for discovered_proc in discovered_procs:
                    if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_:
                        cfg_proc_specs.append(discovered_proc)
                        break
                else:
                    raise ValueError("Cannot find procedure with name "
                                     "'%s'" % cfg_proc_)

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        #      Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        # also provides initial set of content to be tracked with git (not annex)
        if no_annex:
            tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates)
        else:
            tbrepo, add_to_git = _setup_annex_repo(
                path, initopts, fake_dates, description)

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note, must not happen earlier (before if) since "smart" it would not be
        tbds_config = tbds.config

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, scope='branch')

        if _seed is None:
            # just the standard way
            # use a fully random identifier (i.e. UUID version 4)
            uuid_id = str(uuid.uuid4())
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            scope='branch',
            reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, scope='local', reload=False)
        # all config manipulation is done -> full reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_spec in cfg_proc_specs:
            yield from tbds.run_procedure(
                cfg_proc_spec,
                result_renderer='disabled',
                return_type='generator',
            )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            yield from refds.save(
                path=tbds.path,
                return_type='generator',
                result_renderer='disabled',
            )

        res.update({'status': 'ok'})
        yield res
def call_from_parser(cls, args):
    # XXX needs safety check for name collisions
    from inspect import getargspec
    argspec = getargspec(cls.__call__)
    if argspec[2] is None:
        # no **kwargs in the call receiver, pull argnames from signature
        argnames = getargspec(cls.__call__)[0]
    else:
        # common options
        # XXX define or better get from elsewhere
        common_opts = ('change_path', 'common_debug', 'common_idebug', 'func',
                       'help', 'log_level', 'logger', 'pbs_runner',
                       'result_renderer', 'subparser')
        argnames = [name for name in dir(args)
                    if not (name.startswith('_') or name in common_opts)]
    kwargs = {k: getattr(args, k) for k in argnames if is_api_arg(k)}
    # we are coming from the entry point, this is the toplevel command,
    # let it run like generator so we can act on partial results quicker
    # TODO remove following condition test when transition is complete and
    # run indented code unconditionally
    if cls.__name__ not in ('AddArchiveContent', 'CrawlInit', 'Crawl',
                            'CreateSiblingGithub', 'CreateTestDataset',
                            'DownloadURL', 'Export', 'Ls', 'Move', 'SSHRun',
                            'Test'):
        # set all common args explicitly to override class defaults
        # that are tailored towards the Python API
        kwargs['return_type'] = 'generator'
        kwargs['result_xfm'] = None
        # allow commands to override the default, unless something other than
        # default is requested
        kwargs['result_renderer'] = \
            args.common_output_format if args.common_output_format != 'default' \
            else getattr(cls, 'result_renderer', args.common_output_format)
        if '{' in args.common_output_format:
            # stupid hack, could and should become more powerful
            kwargs['result_renderer'] = \
                lambda x, **kwargs: ui.message(args.common_output_format.format(
                    **{k: {k_.replace(':', '#'): v_ for k_, v_ in v.items()}
                       if isinstance(v, dict) else v
                       for k, v in x.items()}))
        if args.common_on_failure:
            kwargs['on_failure'] = args.common_on_failure
        # compose filter function from to-be-invented cmdline options
        result_filter = None
        if args.common_report_status:
            if args.common_report_status == 'success':
                result_filter = EnsureKeyChoice('status', ('ok', 'notneeded'))
            elif args.common_report_status == 'failure':
                result_filter = EnsureKeyChoice('status',
                                                ('impossible', 'error'))
            else:
                result_filter = EnsureKeyChoice('status',
                                                (args.common_report_status,))
        if args.common_report_type:
            tfilt = EnsureKeyChoice('type', tuple(args.common_report_type))
            result_filter = result_filter & tfilt if result_filter else tfilt
        kwargs['result_filter'] = result_filter
    try:
        ret = cls.__call__(**kwargs)
        if inspect.isgenerator(ret):
            ret = list(ret)
        if args.common_output_format == 'tailored' and \
                hasattr(cls, 'custom_result_summary_renderer'):
            cls.custom_result_summary_renderer(ret)
        return ret
    except KeyboardInterrupt as exc:
        ui.error("\nInterrupted by user while doing magic: %s" % exc_str(exc))
        sys.exit(1)
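# Hedged sketch of the filter composition performed above for a hypothetical
# invocation; the argparse dest names common_report_status/common_report_type
# are assumed to correspond to the main parser's --report-status/--report-type
# options:
#
#   datalad --report-status=failure --report-type=dataset install ...
#
# would hand the command the equivalent of
result_filter = EnsureKeyChoice('status', ('impossible', 'error')) \
    & EnsureKeyChoice('type', ('dataset',))
# i.e. only records describing datasets that errored (or were impossible to
# process) get reported.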