Example #1
    def __call__(types, files=None, dataset=None):
        dataset = require_dataset(dataset or curdir,
                                  purpose="extract metadata",
                                  check_installed=not files)
        if not files:
            ds = require_dataset(dataset, check_installed=True)
            subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
            files = list(_get_metadatarelevant_paths(ds, subds))

        dsmeta, contentmeta, error = _get_metadata(dataset,
                                                   types,
                                                   global_meta=True,
                                                   content_meta=bool(files),
                                                   paths=files)

        if dataset is not None and dataset.is_installed():
            res = get_status_dict(action='metadata',
                                  ds=dataset,
                                  refds=dataset.path,
                                  metadata=dsmeta,
                                  status='error' if error else 'ok')
            yield res

        for p in contentmeta:
            res = get_status_dict(action='metadata',
                                  path=opj(dataset.path, p) if dataset else p,
                                  refds=dataset.path,
                                  metadata=contentmeta[p],
                                  type='file',
                                  status='error' if error else 'ok')
            if dataset:
                res['parentds'] = dataset.path
            yield res
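The records yielded above are plain dictionaries assembled by get_status_dict(). As a rough illustration, a single per-file record could look like the following sketch; the paths and metadata values are invented:

    # illustrative only: approximate shape of one yielded per-file record,
    # with invented paths and metadata values
    example_record = {
        'action': 'metadata',
        'path': '/tmp/demo-dataset/sub-01/anat.nii.gz',
        'refds': '/tmp/demo-dataset',
        'parentds': '/tmp/demo-dataset',
        'type': 'file',
        'status': 'ok',
        'metadata': {'format': 'NIfTI-1'},
    }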
Example #2
    def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
        dataset = require_dataset(dataset, purpose='clean-up')

        for dirpath, flag, msg, sing_pl in [
            (ARCHIVES_TEMP_DIR, "cached-archives",
             "temporary archive", ("directory", "directories")),
            (ANNEX_TEMP_DIR, "annex-tmp",
             "temporary annex", ("file", "files")),
        ]:
            lgr.info("Considering to clean %s:%s", dataset, dirpath)
            if not ((what is None) or (flag in what)):
                continue
            topdir = opj(dataset.path, dirpath)
            paths = glob(opj(topdir, '*'))
            if paths:
                pl = len(paths) > 1
                pwd = getpwd()
                # relative version if possible
                rtopdir = topdir[len(pwd) + 1:] \
                    if topdir.startswith(pwd) else topdir
                ui.message("Removing %d %s %s under %s: %s"
                           % (len(paths),
                              msg, sing_pl[int(pl)],
                              rtopdir,
                              ", ".join(sorted([x[len(topdir) + 1:] for x in paths]))))
                rmtree(topdir)

        if recursive:
            for sub in dataset.get_subdatasets(
                recursive=True,
                recursion_limit=recursion_limit,
                absolute=False):
                Dataset(sub).clean(what=what)
Example #3
    def __call__(dataset=None,
                 what=None,
                 recursive=False,
                 recursion_limit=None):
        dataset = require_dataset(dataset, purpose='clean-up')

        for dirpath, flag, msg, sing_pl in [
            (ARCHIVES_TEMP_DIR, "cached-archives", "temporary archive",
             ("directory", "directories")),
            (ANNEX_TEMP_DIR, "annex-tmp", "temporary annex", ("file",
                                                              "files")),
        ]:
            lgr.info("Considering to clean %s:%s", dataset, dirpath)
            if not ((what is None) or (flag in what)):
                continue
            topdir = opj(dataset.path, dirpath)
            paths = glob(opj(topdir, '*'))
            if paths:
                pl = len(paths) > 1
                pwd = getpwd()
                # relative version if possible
                rtopdir = topdir[len(pwd) + 1:] \
                    if topdir.startswith(pwd) else topdir
                ui.message(
                    "Removing %d %s %s under %s: %s" %
                    (len(paths), msg, sing_pl[int(pl)], rtopdir, ", ".join(
                        sorted([x[len(topdir) + 1:] for x in paths]))))
                rmtree(topdir)

        if recursive:
            for sub in dataset.get_subdatasets(recursive=True,
                                               recursion_limit=recursion_limit,
                                               absolute=False):
                Dataset(sub).clean(what=what)
Example #4
    def __call__(astype, dataset, getcmdhelp=False, output=None, **kwargs):
        # get a handle on the relevant plugin module
        import datalad.export as export_mod
        try:
            exmod = import_module('.%s' % (astype, ),
                                  package=export_mod.__package__)
        except ImportError as e:
            raise ValueError("cannot load exporter '{}': {}".format(
                astype, exc_str(e)))
        if getcmdhelp:
            # no result, but return the module to make the renderer do the rest
            return (exmod, None)

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='exporting')
        # call the plugin, either with the argv array from the cmdline call
        # or directly with the kwargs
        if 'datalad_unparsed_args' in kwargs:
            result = exmod._datalad_export_plugin_call(
                ds, argv=kwargs['datalad_unparsed_args'], output=output)
        else:
            result = exmod._datalad_export_plugin_call(ds,
                                                       output=output,
                                                       **kwargs)
        return (exmod, result)
Example #5
    def __call__(spec, dataset=None):
        if not isinstance(spec, (tuple, list)):
            # maybe coming from config
            import shlex
            spec = shlex.split(spec)
        name = spec[0]
        args = spec[1:]
        procedure_file = _get_procedure_implementation(name, ds=dataset)
        if not procedure_file:
            # TODO error result
            raise ValueError("Cannot find procedure with name '%s'", name)

        ds = require_dataset(dataset,
                             check_installed=False,
                             purpose='run a procedure') if dataset else None

        cmd_tmpl = _guess_exec(procedure_file)
        cmd = cmd_tmpl.format(script=procedure_file,
                              ds=ds.path if ds else '',
                              args=u' '.join(u'"{}"'.format(a)
                                             for a in args) if args else '')
        lgr.debug('Attempt to run procedure {} as: {}'.format(name, cmd))
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                # See gh-2593 for discussion on run feature extension
                #explicit=True,
                #inputs=None,
                #outputs=None,
                # pass through here
                on_failure='ignore',
        ):
            yield r
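The snippet relies on shlex.split() to turn a configured spec string into a procedure name plus arguments, and then interpolates them into a command template. A minimal, self-contained sketch of that pattern follows; the template string, procedure name, and paths are illustrative assumptions, not values taken from DataLad:

    import shlex

    # hypothetical spec as it might come from configuration
    spec = 'setup-templates "output dir" --verbose'
    parts = shlex.split(spec)       # ['setup-templates', 'output dir', '--verbose']
    name, args = parts[0], parts[1:]

    # assumed command template, analogous to what _guess_exec() might produce
    cmd_tmpl = 'python "{script}" "{ds}" {args}'
    cmd = cmd_tmpl.format(
        script='/path/to/procedures/{}.py'.format(name),
        ds='/path/to/dataset',
        args=u' '.join(u'"{}"'.format(a) for a in args) if args else '')
    print(cmd)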
Example #6
def no_annex(ds):
    ds = require_dataset(
        ds,
        check_installed=True,
        purpose='configuration')

    if isinstance(ds.repo, AnnexRepo):
        repo = ds.repo
        # TODO: if procedures can have options -- add --force handling/passing
        #
        # annex uninit unlocks files for which there is content (nice) but just proceeds
        # and leaves broken symlinks for files without content.  For the current purpose
        # of this procedure we just prevent "uninit" of any annex with some files already
        # annexed.
        if any(repo.call_annex_items_(['whereis', '--all'])):
            raise RuntimeError("Annex has some annexed files, unsafe")
        # remove annex
        repo.call_annex(['uninit'])

    noannex_file = ds.pathobj / ".noannex"
    if not noannex_file.exists():
        lgr.info("Creating and committing a .noannex file")
        noannex_file.touch()
        ds.save(noannex_file,
                message="Added .noannex to prevent accidental initialization of git-annex")
Example #7
    def __call__(title, name="osf", dataset=None, mode="annex"):
        ds = require_dataset(dataset,
                             purpose="create OSF remote",
                             check_installed=True)
        # we need an annex
        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(action="create-sibling-osf",
                                  type="dataset",
                                  status="impossible",
                                  message="dataset has no annex")
            return

        # NOTES:
        # - we prob. should check osf-special-remote availability upfront to
        #   fail early
        # - publish-depends option?
        # - (try to) detect github/gitlab/bitbucket to suggest linking it on
        #   OSF and configure publish dependency
        #   -> prob. overkill; just make it clear in the doc
        # - add --recursive option
        #       - recursive won't work easily. Need to think that through.
        #       - would need a naming scheme for subdatasets
        #       - flat on OSF or a tree?
        #       - how do we detect something is there already, so we can skip
        #         rather than duplicate (with a new name)?
        #         osf-type-special-remote sufficient to decide it's not needed?
        # - adapt to conclusions in issue #30
        #   -> create those subcomponents
        # - results need to report URL for created projects suitable for datalad
        #   output formatting!
        #   -> result_renderer
        #   -> needs to be returned by create_project

        # - option: Make public!

        cred = get_credentials(allow_interactive=True)
        osf = OSF(**cred)
        proj_id, proj_url = create_project(osf_session=osf.session,
                                           title=title)
        yield get_status_dict(action="create-project-osf",
                              type="dataset",
                              url=proj_url,
                              id=proj_id,
                              status="ok")

        init_opts = [
            "encryption=none", "type=external", "externaltype=osf",
            "autoenable=true", "project={}".format(proj_id)
        ]

        if mode == "export":
            init_opts += ["exporttree=yes"]

        ds.repo.init_remote(name, options=init_opts)
        # TODO: add special remote name to result?
        #       need to check w/ datalad-siblings conventions
        yield get_status_dict(action="add-sibling-osf",
                              type="dataset",
                              status="ok")
Example #8
    def __call__(query=None,
                 dataset=None,
                 force_reindex=False,
                 max_nresults=None,
                 mode=None,
                 full_record=False,
                 show_keys=None,
                 show_query=False):
        try:
            ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            for r in _search_from_virgin_install(dataset, query):
                yield r
            return

        if mode is None:
            # let's get inspired by what the dataset/user think is
            # default
            mode = ds.config.obtain('datalad.search.default-mode')

        if mode == 'egrep':
            searcher = _EGrepSearch
        elif mode == 'egrepcs':
            searcher = _EGrepCSSearch
        elif mode == 'textblob':
            searcher = _BlobSearch
        elif mode == 'autofield':
            searcher = _AutofieldSearch
        else:
            raise ValueError(
                'unknown search mode "{}"'.format(mode))

        searcher = searcher(ds, force_reindex=force_reindex)

        if show_keys:
            searcher.show_keys(show_keys)
            return

        if not query:
            return

        if show_query:
            print(repr(searcher.get_query(query)))
            return

        nhits = 0
        for r in searcher(
                query,
                max_nresults=max_nresults,
                full_record=full_record):
            nhits += 1
            yield r
        if not nhits:
            lgr.info(searcher.get_nohits_msg() or 'no hits')
Example #9
    def __call__(name, url=None, dataset=None, execute=None, image=None):

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add container')

        loc_cfg_var = "datalad.containers.location"

        # TODO: We should provide an entry point (or sth similar) for extensions
        # to get config definitions into the ConfigManager. In other words an
        # easy way to extend definitions in datalad's common_cfgs.py.
        container_loc = \
            ds.config.obtain(loc_cfg_var,
                             where=definitions[loc_cfg_var]['destination'],
                             store=True,
                             default=definitions[loc_cfg_var]['default'],
                             dialog_type=definitions[loc_cfg_var]['ui'][0],
                             valtype=definitions[loc_cfg_var]['type'],
                             **definitions[loc_cfg_var]['ui'][1]
                             )

        result = {
            "action": "containers_add",
            "path": op.join(ds.path, container_loc, name),
            "type": "file"
        }

        if not url:
            url = ds.config.get("datalad.containers.{}.url".format(name))
        if not url:
            raise InsufficientArgumentsError(
                "URL is required and can be provided either via parameter "
                "'url' or config key 'datalad.containers.{}.url'"
                "".format(name))

        try:
            ds.repo.add_url_to_file(op.join(container_loc, name), url)
            ds.save(
                op.join(container_loc, name),
                message="[DATALAD] Added container {name}".format(name=name))
            result["status"] = "ok"
        except Exception as e:
            result["status"] = "error"
            result["message"] = str(e)

        yield result

        # store configs
        ds.config.set("datalad.containers.{}.url".format(name), url)
        if execute:
            ds.config.add("datalad.containers.{}.exec".format(name), execute)
        if image:
            ds.config.add("datalad.containers.{}.image".format(name), image)

        # TODO: Just save won't work in direct mode, since save fails to detect changes
        ds.add(op.join(".datalad", "config"),
               message="[DATALAD] Store config for container '{name}'"
               "".format(name=name))
Example #10
    def __call__(dataset=None, sensitive=None, clipboard=None):
        from datalad.distribution.dataset import require_dataset
        from datalad.support.exceptions import NoDatasetArgumentFound
        from datalad.interface.results import get_status_dict

        ds = None
        try:
            ds = require_dataset(dataset, check_installed=False, purpose='reporting')
        except NoDatasetArgumentFound:
            # failure is already logged
            pass
        if ds and not ds.is_installed():
            # we don't deal with absent datasets
            ds = None
        if sensitive:
            if ds is None:
                from datalad import cfg
            else:
                cfg = ds.config
        else:
            cfg = None

        from datalad.ui import ui
        from datalad.support.external_versions import external_versions

        infos = {}
        res = get_status_dict(
            action='wtf',
            path=ds.path if ds else op.abspath(op.curdir),
            type='dataset' if ds else 'directory',
            status='ok',
            logger=lgr,
            infos=infos,
        )
        infos['datalad'] = _describe_datalad()
        infos['git-annex'] = _describe_annex()
        infos['system'] = _describe_system()
        infos['environment'] = _describe_environment()
        infos['configuration'] = _describe_configuration(cfg, sensitive)
        infos['extensions'] = _describe_extensions()
        infos['metadata_extractors'] = _describe_metadata_extractors()
        infos['dependencies'] = _describe_dependencies()
        if ds:
            try:
                infos['dataset'] = _describe_dataset(ds, sensitive)
            except InvalidGitRepositoryError as e:
                infos['dataset'] = {"invalid": exc_str(e)}

        if clipboard:
            external_versions.check(
                'pyperclip', msg="It is needed to be able to use clipboard")
            import pyperclip
            report = _render_report(res)
            pyperclip.copy(report)
            ui.message("WTF information of length %s copied to clipboard"
                       % len(report))
        yield res
        return
Example #11
 def __call__(dataset=None,
              what=None,
              recursive=False,
              recursion_limit=None):
     ds = require_dataset(dataset, purpose='clean-up')
     res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
     for ap in AnnotatePaths.__call__(dataset=ds.path,
                                      recursive=recursive,
                                      recursion_limit=recursion_limit,
                                      action='clean',
                                      unavailable_path_status='impossible',
                                      nondataset_path_status='impossible',
                                      return_type='generator',
                                      on_failure='ignore'):
         if ap.get('status', None):
             yield ap
             continue
         if ap.get('type', None) != 'dataset':
             ap.update(status='impossible',
                       message='only datasets can be cleaned')
             yield ap
             continue
         d = ap['path']
         gitdir = get_git_dir(d)
         for dirpath, flag, msg, sing_pl in [
             (ARCHIVES_TEMP_DIR, "cached-archives", "temporary archive",
              ("directory", "directories")),
             (ANNEX_TEMP_DIR, "annex-tmp", "temporary annex", ("file",
                                                               "files")),
             (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
              "metadata search index", ("file", "files")),
         ]:
             topdir = opj(d, dirpath)
             lgr.debug("Considering to clean %s:%s", d, dirpath)
             if not ((what is None) or (flag in what)):
                 yield get_status_dict(path=topdir,
                                       status='notneeded',
                                       type='directory',
                                       **res_kwargs)
                 continue
             paths = glob(opj(topdir, '*'))
             if not paths:
                 yield get_status_dict(path=topdir,
                                       status='notneeded',
                                       type='directory',
                                       **res_kwargs)
                 continue
             pl = len(paths) > 1
             message = ("Removed %d %s %s: %s", len(paths), msg,
                        sing_pl[int(pl)], ", ".join(
                            sorted([x[len(topdir) + 1:] for x in paths])))
             rmtree(topdir)
             yield get_status_dict(path=topdir,
                                   status='ok',
                                   type='dir',
                                   message=message,
                                   **res_kwargs)
Example #12
    def __call__(dataset=None, recursive=False, contains=None):
        ds = require_dataset(dataset, check_installed=True,
                             purpose='list containers')
        refds = ds.path

        if recursive:
            for sub in ds.subdatasets(
                    contains=contains,
                    on_failure='ignore',
                    return_type='generator',
                    result_renderer='disabled'):
                subds = Dataset(sub['path'])
                if subds.is_installed():
                    for c in subds.containers_list(recursive=recursive,
                                                   return_type='generator',
                                                   on_failure='ignore',
                                                   result_filter=None,
                                                   result_renderer=None,
                                                   result_xfm=None):
                        c['name'] = sub['gitmodule_name'] + '/' + c['name']
                        c['refds'] = refds
                        yield c

        # all info is in the dataset config!
        var_prefix = 'datalad.containers.'
        containers = {}
        for var, value in ds.config.items():
            if not var.startswith(var_prefix):
                # not an interesting variable
                continue
            var_comps = var[len(var_prefix):].split('.')
            cname = var_comps[0]
            ccfgname = '.'.join(var_comps[1:])
            if not ccfgname:
                continue

            cinfo = containers.get(cname, {})
            cinfo[ccfgname] = value

            containers[cname] = cinfo

        for k, v in containers.items():
            if 'image' not in v:
                # there is no container location configured
                continue
            res = get_status_dict(
                status='ok',
                action='containers',
                name=k,
                type='file',
                path=op.join(ds.path, v.pop('image')),
                refds=refds,
                parentds=ds.path,
                # TODO
                #state='absent' if ... else 'present'
                **v)
            yield res
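The configuration loop above folds flat `datalad.containers.<name>.<key>` variables into one dict per container. A self-contained sketch of that folding, with invented configuration items:

    # invented flat configuration items, as ds.config.items() might yield them
    items = {
        'datalad.containers.fsl.image': '.datalad/environments/fsl.simg',
        'datalad.containers.fsl.exec': 'singularity exec {img} {cmd}',
        'datalad.containers.afni.image': '.datalad/environments/afni.simg',
        'core.editor': 'vim',  # unrelated variable, ignored
    }

    var_prefix = 'datalad.containers.'
    containers = {}
    for var, value in items.items():
        if not var.startswith(var_prefix):
            continue
        cname, _, ccfgname = var[len(var_prefix):].partition('.')
        if ccfgname:
            containers.setdefault(cname, {})[ccfgname] = value

    # -> {'fsl': {'image': ..., 'exec': ...}, 'afni': {'image': ...}}
    print(containers)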
Example #13
    def __call__(path=None,
                 *,
                 dataset=None,
                 annex=None,
                 untracked='normal',
                 recursive=False,
                 recursion_limit=None,
                 eval_subdataset_state='full',
                 report_filetype=None):
        if report_filetype is not None:
            warnings.warn(
                "status(report_filetype=) no longer supported, and will be removed "
                "in a future release", DeprecationWarning)

        # To the next white knight that comes in to re-implement `status` as a
        # special case of `diff`. There is one fundamental difference between
        # the two commands: `status` can always use the worktree as evident on
        # disk as a constraint (e.g. to figure out which subdataset a path is
        # in); `diff` cannot do that (everything needs to be handled based on a
        # "virtual" representation of a dataset hierarchy).
        # MIH concludes that while `status` can be implemented as a special case
        # of `diff` doing so would complicate and slow down both `diff` and
        # `status`. So while the apparent almost code-duplication between the
        # two commands feels wrong, the benefit is speed. Any future RF should
        # come with evidence that speed does not suffer, and complexity stays
        # on a manageable level
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='report status')
        ds_path = ds.path
        queried = set()
        content_info_cache = {}
        for res in _yield_paths_by_ds(ds, dataset, ensure_list(path)):
            if 'status' in res:
                # this is an error
                yield res
                continue
            for r in yield_dataset_status(
                    res['ds'],
                    res['paths'],
                    annex,
                    untracked,
                    recursion_limit
                    if recursion_limit is not None else -1 if recursive else 0,
                    queried,
                    eval_subdataset_state,
                    None,
                    content_info_cache,
                    reporting_order='depth-first'):
                if 'status' not in r:
                    r['status'] = 'ok'
                yield dict(
                    r,
                    refds=ds_path,
                    action='status',
                )
Example #14
    def __call__(extractor_name: str,
                 path: Optional[str] = None,
                 dataset: Optional[Union[Dataset, str]] = None,
                 into: Optional[Union[Dataset, str]] = None):

        # Get basic arguments
        source_dataset = require_dataset(dataset or curdir,
                                         purpose="extract metadata",
                                         check_installed=path is not None)
        source_primary_data_version = source_dataset.repo.get_hexsha()

        if into:
            into_ds = require_dataset(into,
                                      purpose="extract metadata",
                                      check_installed=True)
            realm = into_ds.repo
            # TODO: check for adjusted/managed branch, use get_corresponding_branch
            root_primary_data_version = into_ds.repo.get_hexsha()
        else:
            realm = source_dataset.repo
            root_primary_data_version = source_primary_data_version

        extractor_class = get_extractor_class(extractor_name)
        dataset_tree_path, file_tree_path = get_path_info(
            source_dataset, path, into)

        extraction_parameters = ExtractionParameter(
            realm, source_dataset, UUID(source_dataset.id), extractor_class,
            extractor_name, dataset_tree_path, file_tree_path,
            root_primary_data_version, source_primary_data_version,
            dataset.config.get("user.name"), dataset.config.get("user.email"))

        # If a path is given, we assume file-level metadata extraction is
        # requested, and the extractor class is a subclass of
        # FileMetadataExtractor. If path is not given, we assume that
        # dataset-level extraction is requested and the extractor
        # class is a subclass of DatasetMetadataExtractor
        if path:
            yield from do_file_extraction(extraction_parameters)
        else:
            yield from do_dataset_extraction(extraction_parameters)

        return
Example #15
    def __call__(dataset=None, sensitive=None, clipboard=None):
        from datalad.distribution.dataset import require_dataset
        from datalad.support.exceptions import NoDatasetArgumentFound
        from datalad.interface.results import get_status_dict

        ds = None
        try:
            ds = require_dataset(dataset, check_installed=False, purpose='reporting')
        except NoDatasetArgumentFound:
            # failure is already logged
            pass
        if ds and not ds.is_installed():
            # we don't deal with absent datasets
            ds = None
        if sensitive:
            if ds is None:
                from datalad import cfg
            else:
                cfg = ds.config
        else:
            cfg = None

        from datalad.ui import ui
        from datalad.support.external_versions import external_versions

        infos = {}
        res = get_status_dict(
            action='wtf',
            path=ds.path if ds else op.abspath(op.curdir),
            type='dataset' if ds else 'directory',
            status='ok',
            logger=lgr,
            infos=infos,
        )
        infos['datalad'] = _describe_datalad()
        infos['git-annex'] = _describe_annex()
        infos['system'] = _describe_system()
        infos['environment'] = _describe_environment()
        infos['configuration'] = _describe_configuration(cfg, sensitive)
        infos['extensions'] = _describe_extensions()
        infos['metadata_extractors'] = _describe_metadata_extractors()
        infos['dependencies'] = _describe_dependencies()
        if ds:
            infos['dataset'] = _describe_dataset(ds, sensitive)

        if clipboard:
            external_versions.check(
                'pyperclip', msg="It is needed to be able to use clipboard")
            import pyperclip
            report = _render_report(res)
            pyperclip.copy(report)
            ui.message("WTF information of length %s copied to clipboard"
                       % len(report))
        yield res
        return
Example #16
 def __call__(dataset=None, what=None, recursive=False, recursion_limit=None):
     ds = require_dataset(dataset, purpose='clean-up')
     res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
     for ap in AnnotatePaths.__call__(
             dataset=ds.path,
             recursive=recursive,
             recursion_limit=recursion_limit,
             action='clean',
             unavailable_path_status='impossible',
             nondataset_path_status='impossible',
             return_type='generator',
             on_failure='ignore'):
         if ap.get('status', None):
             yield ap
             continue
         if ap.get('type', None) != 'dataset':
             ap.update(status='impossible',
                       message='only datasets can be cleaned')
             yield ap
             continue
         d = ap['path']
         gitdir = GitRepo.get_git_dir(d)
         DIRS_PLURAL = ("directory", "directories")
         FILES_PLURAL = ("file", "files")
         for dirpath, flag, msg, sing_pl in [
             (ARCHIVES_TEMP_DIR, "cached-archives",
              "temporary archive", DIRS_PLURAL),
             (ANNEX_TEMP_DIR, "annex-tmp",
              "temporary annex", FILES_PLURAL),
             (ANNEX_TRANSFER_DIR, "annex-transfer",
              "annex temporary transfer", DIRS_PLURAL),
             (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
              "metadata search index", FILES_PLURAL),
         ]:
             topdir = opj(d, dirpath)
             lgr.debug("Considering to clean %s:%s", d, dirpath)
             if not ((what is None) or (flag in what)):
                 yield get_status_dict(
                     path=topdir, status='notneeded', type='directory', **res_kwargs)
                 continue
             paths = glob(opj(topdir, '*'))
             if not paths:
                 yield get_status_dict(
                     path=topdir, status='notneeded', type='directory', **res_kwargs)
                 continue
             pl = len(paths) > 1
             message = ("Removed %d %s %s: %s",
                        len(paths), msg, sing_pl[int(pl)],
                        ", ".join(sorted([x[len(topdir) + 1:] for x in paths])))
             rmtree(topdir)
             yield get_status_dict(
                 path=topdir, status='ok', type='dir', message=message,
                 **res_kwargs)
Example #17
    def __call__(path=None,
                 dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        # no constraints given -> query subdatasets under curdir
        if not path and dataset is None:
            path = os.curdir
        paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
            if path else None

        ds = require_dataset(dataset,
                             check_installed=False,
                             purpose='subdataset reporting/modification')
        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = [
                rev_resolve_path(c, dataset) for c in assure_list(contains)
            ]
        for r in _get_submodules(ds, paths, fulfilled, recursive,
                                 recursion_limit, contains, bottomup,
                                 set_property, delete_property, refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = text_type(r['path'])
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
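valid_key is not part of the snippet; based on the error message ("alphanumeric plus '-' only, must start with a letter"), a pattern along these lines would implement that rule. The exact regular expression is an assumption:

    import re

    # assumed definition matching the documented rule
    valid_key = re.compile(r'^[A-Za-z][A-Za-z0-9-]*$')

    assert valid_key.match('my-property')
    assert valid_key.match('1bad') is None
    assert valid_key.match('has_underscore') is None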
Example #18
 def __call__(dataset=None,
              what=None,
              recursive=False,
              recursion_limit=None):
     ds = require_dataset(dataset, purpose='clean-up')
     res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
     for wds in itertools.chain(
         [ds],
             ds.subdatasets(fulfilled=True,
                            recursive=recursive,
                            recursion_limit=recursion_limit,
                            return_type='generator',
                            result_renderer='disabled',
                            result_xfm='datasets') if recursive else []):
         d = wds.path
         gitdir = GitRepo.get_git_dir(d)
         DIRS_PLURAL = ("directory", "directories")
         FILES_PLURAL = ("file", "files")
         for dirpath, flag, msg, sing_pl in [
             (ARCHIVES_TEMP_DIR, "cached-archives", "temporary archive",
              DIRS_PLURAL),
             (ANNEX_TEMP_DIR, "annex-tmp", "temporary annex", FILES_PLURAL),
             (ANNEX_TRANSFER_DIR, "annex-transfer",
              "annex temporary transfer", DIRS_PLURAL),
             (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
              "metadata search index", FILES_PLURAL),
         ]:
             topdir = opj(d, dirpath)
             lgr.debug("Considering to clean %s:%s", d, dirpath)
             if not ((what is None) or (flag in what)):
                 yield get_status_dict(path=topdir,
                                       status='notneeded',
                                       type='directory',
                                       **res_kwargs)
                 continue
             paths = glob(opj(topdir, '*'))
             if not paths:
                 yield get_status_dict(path=topdir,
                                       status='notneeded',
                                       type='directory',
                                       **res_kwargs)
                 continue
             pl = len(paths) > 1
             message = ("Removed %d %s %s: %s", len(paths), msg,
                        sing_pl[int(pl)], ", ".join(
                            sorted([x[len(topdir) + 1:] for x in paths])))
             rmtree(topdir)
             yield get_status_dict(path=topdir,
                                   status='ok',
                                   type='dir',
                                   message=message,
                                   **res_kwargs)
Example #19
    def __call__(
            path=None,
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        # no constraints given -> query subdatasets under curdir
        if not path and dataset is None:
            path = os.curdir
        paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
            if path else None

        ds = require_dataset(
            dataset, check_installed=False, purpose='subdataset reporting/modification')
        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = [rev_resolve_path(c, dataset) for c in assure_list(contains)]
        for r in _get_submodules(
                ds, paths, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = text_type(r['path'])
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
Example #20
    def __call__(message=None,
                 files=None,
                 dataset=None,
                 all_changes=False,
                 version_tag=None,
                 recursive=False,
                 recursion_limit=None,
                 super_datasets=False):
        if dataset:
            dataset = require_dataset(dataset,
                                      check_installed=True,
                                      purpose='saving')
        content_by_ds, unavailable_paths = Interface._prep(
            path=files,
            dataset=dataset,
            recursive=recursive,
            recursion_limit=recursion_limit)
        if unavailable_paths:
            lgr.warning("ignoring non-existent path(s): %s", unavailable_paths)
        # here we know all datasets associated with any inputs
        # so we can expand "all_changes" right here to avoid confusion
        # wrt "super" and "intermediate" datasets discovered later on
        if all_changes:
            # and we do this by replacing any given paths with the respective
            # datasets' base path
            for ds in content_by_ds:
                content_by_ds[ds] = [ds]

        if super_datasets:
            content_by_ds = amend_pathspec_with_superdatasets(
                content_by_ds,
                # save up to and including the base dataset (if one is given)
                # otherwise up to the very top
                topmost=dataset if dataset else True,
                limit_single=False)

        if dataset:
            # stuff all paths also into the base dataset slot to make sure
            # we get all links between relevant subdatasets
            bp = content_by_ds.get(dataset.path, [])
            for c in content_by_ds:
                bp.extend(content_by_ds[c])
            content_by_ds[dataset.path] = list(set(bp))

        saved_ds = save_dataset_hierarchy(
            content_by_ds,
            base=dataset.path if dataset and dataset.is_installed() else None,
            message=message,
            version_tag=version_tag)

        return saved_ds
Example #21
    def __call__(extractorname: str,
                 path: Optional[str] = None,
                 dataset: Optional[Union[Dataset, str]] = None,
                 extractorargs: Optional[List[str]] = None):

        # Get basic arguments
        extractor_name = extractorname
        extractor_args = extractorargs
        path = None if path == "++" else path

        source_dataset = require_dataset(dataset or curdir,
                                         purpose="extract metadata",
                                         check_installed=path is not None)

        if not source_dataset.repo:
            raise ValueError(f"No dataset found in {dataset or curdir}.")

        source_dataset_version = source_dataset.repo.get_hexsha()

        extractor_class = get_extractor_class(extractor_name)
        dataset_tree_path, file_tree_path = get_path_info(
            source_dataset,
            Path(path) if path else None, None)

        extraction_parameters = ExtractionParameter(
            source_dataset=source_dataset,
            source_dataset_id=UUID(source_dataset.id),
            source_dataset_version=source_dataset_version,
            extractor_class=extractor_class,
            extractor_name=extractor_name,
            extractor_arguments=args_to_dict(extractor_args),
            file_tree_path=file_tree_path,
            agent_name=source_dataset.config.get("user.name"),
            agent_email=source_dataset.config.get("user.email"))

        # If a path is given, we assume file-level metadata extraction is
        # requested, and the extractor class should be a subclass of
        # FileMetadataExtractor (or a legacy extractor).
        # If path is not given, we assume that a dataset-level extraction is
        # requested and the extractor class is a subclass of
        # DatasetMetadataExtractor (or a legacy extractor class).
        path = None if path == "++" else path

        if path:
            # Check whether the path points to a sub_dataset.
            ensure_path_validity(source_dataset, file_tree_path)
            yield from do_file_extraction(extraction_parameters)
        else:
            yield from do_dataset_extraction(extraction_parameters)
        return
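args_to_dict() is not shown in the snippet. Assuming the extractor arguments arrive as an alternating key/value list from the command line, a hypothetical minimal implementation could look like this:

    from typing import Dict, List, Optional

    def args_to_dict(args: Optional[List[str]]) -> Dict[str, str]:
        """Hypothetical helper: fold ['key1', 'val1', 'key2', 'val2'] into a dict."""
        if not args:
            return {}
        if len(args) % 2 != 0:
            raise ValueError("key/value list has an odd number of elements")
        return dict(zip(args[::2], args[1::2]))

    # e.g. args_to_dict(['format', 'json', 'depth', '2'])
    # -> {'format': 'json', 'depth': '2'}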
Example #22
    def __call__(types, files=None, dataset=None):
        dataset = require_dataset(dataset or curdir,
                                  purpose="extract metadata",
                                  check_installed=not files)
        if not files:
            ds = require_dataset(dataset, check_installed=True)
            subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
            files = list(_get_metadatarelevant_paths(ds, subds))

        dsmeta, contentmeta, error = _get_metadata(
            dataset,
            types,
            global_meta=True,
            content_meta=bool(files),
            paths=files)

        if dataset is not None and dataset.is_installed():
            res = get_status_dict(
                action='metadata',
                ds=dataset,
                refds=dataset.path,
                metadata=dsmeta,
                status='error' if error else 'ok')
            yield res

        for p in contentmeta:
            res = get_status_dict(
                action='metadata',
                path=opj(dataset.path, p) if dataset else p,
                refds=dataset.path,
                metadata=contentmeta[p],
                type='file',
                status='error' if error else 'ok')
            if dataset:
                res['parentds'] = dataset.path
            yield res
Example #23
    def __call__(message=None, files=None, dataset=None,
                 all_changes=False, version_tag=None,
                 recursive=False, recursion_limit=None, super_datasets=False):
        if dataset:
            dataset = require_dataset(
                dataset, check_installed=True, purpose='saving')
        content_by_ds, unavailable_paths = Interface._prep(
            path=files,
            dataset=dataset,
            recursive=recursive,
            recursion_limit=recursion_limit)
        if unavailable_paths:
            lgr.warning("ignoring non-existent path(s): %s",
                        unavailable_paths)
        # here we know all datasets associated with any inputs
        # so we can expand "all_changes" right here to avoid confusion
        # wrt "super" and "intermediate" datasets discovered later on
        if all_changes:
            # and we do this by replacing any given paths with the respective
            # datasets' base path
            for ds in content_by_ds:
                content_by_ds[ds] = [ds]

        if super_datasets:
            content_by_ds = amend_pathspec_with_superdatasets(
                content_by_ds,
                # save up to and including the base dataset (if one is given)
                # otherwise up to the very top
                topmost=dataset if dataset else True,
                limit_single=False)

        if dataset:
            # stuff all paths also into the base dataset slot to make sure
            # we get all links between relevant subdatasets
            bp = content_by_ds.get(dataset.path, [])
            for c in content_by_ds:
                bp.extend(content_by_ds[c])
            content_by_ds[dataset.path] = list(set(bp))

        saved_ds = save_dataset_hierarchy(
            content_by_ds,
            base=dataset.path if dataset and dataset.is_installed() else None,
            message=message,
            version_tag=version_tag)

        return saved_ds
Example #24
    def __call__(name, dataset=None, remove_image=False):
        ds = require_dataset(dataset, check_installed=True,
                             purpose='remove a container')

        res = get_status_dict(
            ds=ds,
            action='containers_remove',
            logger=lgr)

        section = 'datalad.containers.{}'.format(name)
        imagecfg = '{}.image'.format(section)

        to_save = []
        if remove_image and imagecfg in ds.config:
            imagepath = ds.config.get(imagecfg)
            if op.lexists(op.join(ds.path, imagepath)):
                for r in ds.remove(
                        path=imagepath,
                        # XXX shortcoming: this is the only way to say:
                        # don't drop
                        check=False,
                        # config setting might be outdated and image no longer
                        # there -> no reason to fail, just report
                        on_failure='ignore',
                        save=False):
                    yield r
                to_save.append(imagepath)

        if section in ds.config.sections():
            ds.config.remove_section(
                section,
                where='dataset',
                reload=True)
            res['status'] = 'ok'
            to_save.append(op.join('.datalad', 'config'))
        else:
            res['status'] = 'notneeded'
        if to_save:
            for r in ds.save(
                    path=to_save,
                    message='[DATALAD] Remove container {}'.format(name)):
                yield r
        yield res
Example #25
    def __call__(dataset=None):

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='list containers')

        loc_cfg_var = "datalad.containers.location"

        # TODO: We should provide an entry point (or sth similar) for extensions
        # to get config definitions into the ConfigManager. In other words an
        # easy way to extend definitions in datalad's common_cfgs.py.
        container_loc = \
            ds.config.obtain(loc_cfg_var,
                             where=definitions[loc_cfg_var]['destination'],
                             store=True,
                             default=definitions[loc_cfg_var]['default'],
                             dialog_type=definitions[loc_cfg_var]['ui'][0],
                             valtype=definitions[loc_cfg_var]['type'],
                             **definitions[loc_cfg_var]['ui'][1]
                             )

        from six import PY3

        try:
            location_content = listdir(op.join(ds.path, container_loc))
        # on PY3 catch FileNotFoundError, on PY2 fall back to OSError/IOError
        except FileNotFoundError if PY3 else (OSError, IOError) as e:
            # TODO: Right now, just return nothing, since there is nothing.
            # But this may also be an "impossible" result, since the configured
            # common mount point doesn't exist (would need "e.errno == errno.ENOENT"
            # in addition on PY2)
            return

        for r in [n for n in location_content if not n.startswith(".")]:
            yield {
                'status': 'ok',
                'action': 'containers_list',
                'path': op.join(ds.path, container_loc, r),
                # TODO: Might be an image file or a dataset.
                # Use AnnotatePath with container_loc?
                'type': 'file',
                'name': r,
            }
Example #26
    def __call__(astype, dataset, getcmdhelp=False, output=None, **kwargs):
        # get a handle on the relevant plugin module
        import datalad.export as export_mod
        try:
            exmod = import_module('.%s' % (astype,), package=export_mod.__package__)
        except ImportError as e:
            raise ValueError("cannot load exporter '{}': {}".format(
                astype, exc_str(e)))
        if getcmdhelp:
            # no result, but return the module to make the renderer do the rest
            return (exmod, None)

        ds = require_dataset(dataset, check_installed=True, purpose='exporting')
        # call the plugin, either with the argv array from the cmdline call
        # or directly with the kwargs
        if 'datalad_unparsed_args' in kwargs:
            result = exmod._datalad_export_plugin_call(
                ds, argv=kwargs['datalad_unparsed_args'], output=output)
        else:
            result = exmod._datalad_export_plugin_call(
                ds, output=output, **kwargs)
        return (exmod, result)
Example #27
    def __call__(
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        dataset = require_dataset(
            dataset, check_installed=False, purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must start with a letter)",
                        k)
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(
                dataset.path, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
Example #28
    def __call__(
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        dataset = require_dataset(
            dataset, check_installed=False, purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must start with a letter)",
                        k)
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(
                dataset.path, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
Example #29
    def __call__(url, dataset=None, recursive=False):

        # shortcut
        ds = require_dataset(
            dataset, check_installed=True,
            purpose='modifying subdataset URLs')
        assert(ds.repo is not None)

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [GitRepo(opj(ds.path, sub_path))
                                for sub_path in
                                ds.get_subdatasets(recursive=True)]

        for dataset_repo in repos_to_update:
            parser = get_module_parser(dataset_repo)
            for submodule_section in parser.sections():
                # strip the leading 'submodule "' and the trailing '"' to get the name
                submodule_name = submodule_section[11:-1]
                parser.set_value(submodule_section, "url",
                                 url.replace("%NAME",
                                             submodule_name.replace("/", "-")))

        return  # TODO: return value?
Example #30
    def __call__(dataset=None):
        ds = require_dataset(dataset, check_installed=True,
                             purpose='list containers')

        # all info is in the dataset config!
        var_prefix = 'datalad.containers.'
        containers = {}
        for var, value in ds.config.items():
            if not var.startswith(var_prefix):
                # not an interesting variable
                continue
            var_comps = var[len(var_prefix):].split('.')
            cname = var_comps[0]
            ccfgname = '.'.join(var_comps[1:])
            if not ccfgname:
                continue

            cinfo = containers.get(cname, {})
            cinfo[ccfgname] = value

            containers[cname] = cinfo

        for k, v in containers.items():
            if 'image' not in v:
                # there is no container location configured
                continue
            res = get_status_dict(
                status='ok',
                action='containers',
                name=k,
                type='file',
                path=op.join(ds.path, v.pop('image')),
                # TODO
                #state='absent' if ... else 'present'
                **v)
            yield res
Example #31
    def __call__(url, dataset=None, recursive=False):

        # shortcut
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='modifying subdataset URLs')
        assert (ds.repo is not None)

        repos_to_update = [ds.repo]
        if recursive:
            repos_to_update += [
                GitRepo(opj(ds.path, sub_path))
                for sub_path in ds.get_subdatasets(recursive=True)
            ]

        for dataset_repo in repos_to_update:
            parser = get_module_parser(dataset_repo)
            for submodule_section in parser.sections():
                # strip the leading 'submodule "' and the trailing '"' to get the name
                submodule_name = submodule_section[11:-1]
                parser.set_value(
                    submodule_section, "url",
                    url.replace("%NAME", submodule_name.replace("/", "-")))

        return  # TODO: return value?
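The URL rewriting above substitutes a %NAME placeholder with the flattened submodule name. A small illustration with invented values:

    # invented template URL and submodule name
    url = "https://example.com/collections/%NAME.git"
    submodule_name = "derivatives/freesurfer"

    new_url = url.replace("%NAME", submodule_name.replace("/", "-"))
    # -> 'https://example.com/collections/derivatives-freesurfer.git'
    print(new_url)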
Example #32
    def __call__(
            action='query',
            dataset=None,
            name=None,
            url=None,
            pushurl=None,
            description=None,
            # TODO consider true, for now like add_sibling
            fetch=False,
            as_common_datasrc=None,
            publish_depends=None,
            publish_by_default=None,
            annex_wanted=None,
            annex_required=None,
            annex_group=None,
            annex_groupwanted=None,
            inherit=False,
            get_annex_info=True,
            recursive=False,
            recursion_limit=None):

        # TODO: Detect malformed URL and fail?
        # XXX possibly fail if fetch is False and as_common_datasrc

        if annex_groupwanted and not annex_group:
            raise InsufficientArgumentsError(
                "To set groupwanted, you need to provide annex_group option")

        # TODO catch invalid action specified
        action_worker_map = {
            'query': _query_remotes,
            'add': _add_remote,
            'configure': _configure_remote,
            'remove': _remove_remote,
            'enable': _enable_remote,
        }
        # all workers strictly operate on a single dataset
        # anything that deals with hierarchies and/or dataset
        # relationships in general should be dealt with in here
        # at the top-level and vice versa
        worker = action_worker_map[action]

        dataset = require_dataset(
            dataset, check_installed=False, purpose='sibling configuration')
        refds_path = dataset.path

        res_kwargs = dict(refds=refds_path, logger=lgr)

        ds_name = basename(dataset.path)

        # do not form a single list of datasets (with recursion results), to
        # give the fastest possible response, at the price of a longer overall
        # function call
        ds = dataset
        for r in worker(
                # always copy signature to below to avoid bugs!
                ds, name,
                ds.repo.get_remotes(),
                # for the top-level dataset there are no layout questions
                _mangle_urls(url, ds_name),
                _mangle_urls(pushurl, ds_name),
                fetch, description,
                as_common_datasrc, publish_depends, publish_by_default,
                annex_wanted, annex_required, annex_group, annex_groupwanted,
                inherit, get_annex_info,
                **res_kwargs):
            yield r
        if not recursive:
            return

        # do we have instructions to register siblings with some alternative
        # layout?
        replicate_local_structure = url and "%NAME" not in url

        for subds in dataset.subdatasets(
                fulfilled=True,
                recursive=recursive, recursion_limit=recursion_limit,
                result_xfm='datasets'):
            subds_name = relpath(subds.path, start=dataset.path)
            if replicate_local_structure:
                subds_url = slash_join(url, subds_name)
                subds_pushurl = slash_join(pushurl, subds_name)
            else:
                subds_url = \
                    _mangle_urls(url, '/'.join([ds_name, subds_name]))
                subds_pushurl = \
                    _mangle_urls(pushurl, '/'.join([ds_name, subds_name]))
            for r in worker(
                    # always copy signature from above to avoid bugs
                    subds, name,
                    subds.repo.get_remotes(),
                    subds_url,
                    subds_pushurl,
                    fetch,
                    description,
                    as_common_datasrc, publish_depends, publish_by_default,
                    annex_wanted, annex_required, annex_group, annex_groupwanted,
                    inherit, get_annex_info,
                    **res_kwargs):
                yield r
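The recursion branch above switches between two sibling URL layouts: if the given URL contains no `%NAME` placeholder, the local dataset hierarchy is replicated by appending each subdataset's relative path; otherwise `%NAME` is expanded per dataset, with `/` mangled to `-` as in the previous examples. A rough sketch of the two modes, with a stand-in for the `slash_join` helper and assuming `_mangle_urls` performs the `%NAME` substitution:

def slash_join(base, extension):
    # stand-in for the slash_join helper used above: join with exactly one slash
    return base.rstrip('/') + '/' + extension.lstrip('/')

ds_name, subds_name = 'topds', 'sub/ds1'

# no %NAME in the URL -> replicate the local layout
url = 'ssh://server/home/me/topds'
print(slash_join(url, subds_name))
# ssh://server/home/me/topds/sub/ds1

# %NAME present -> one flat namespace with mangled names (assumed _mangle_urls behavior)
url = 'ssh://server/repos/%NAME'
print(url.replace('%NAME', '/'.join([ds_name, subds_name]).replace('/', '-')))
# ssh://server/repos/topds-sub-ds1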
Example #33
0
    def __call__(
            path=None,
            initopts=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            fake_dates=False,
            cfg_proc=None
    ):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # prep for yield
        res = dict(action='create', path=text_type(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        dataset, text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(
                    check_path == p or check_path in p.parents
                    for p in pstatus):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        text_type(parentds_path),
                        [text_type(c) for c in conflict])})
                yield res
                return
            # another set of checks to see whether the target path is pointing
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'}
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        text_type(conflict[0]),
                        text_type(parentds_path))})
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                git_opts=initopts,
                fake_dates=fake_dates)
            # place a .noannex file to indicate annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {
                'type': 'file',
                'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates
            )
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(
                cfg.obtain('datalad.repo.backend'),
                persistent=True, commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'}
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles',
                 '({})'.format(cfg.obtain(
                     'datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(
                        op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(
                    set_attrs,
                    attrfile=op.join('.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get(
                    'annex.largefiles', None) == 'nothing':
                tbds.repo.set_gitattributes([
                    ('**/.git*', {'annex.largefiles': 'nothing'})])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'}

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config variable. Therefore store it beforehand:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            where='dataset',
            reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(dataset, Dataset) and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.save(
                    path=tbds.path,
            ):
                yield r

        res.update({'status': 'ok'})
        yield res

        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r
Example #34
0
    def __call__(
            dataset,
            guess_native_type=False,
            recursive=False,
            recursion_limit=None,
            save=True,
            if_dirty='save-before'):
        ds = require_dataset(
            dataset, check_installed=True, purpose='meta data aggregation')
        modified_ds = []
        if ds.id is None:
            lgr.warning('%s has no configured ID, skipping.', dataset)
            return modified_ds
        # make sure we get to an expected state
        handle_dirty_dataset(ds, if_dirty)

        # if you want to modify the behavior of get_subdatasets(), make sure
        # there is a way to return the subdatasets DEPTH FIRST!
        ds_meta = {}
        for subds_path in ds.get_subdatasets(
                fulfilled=True,
                absolute=False,
                recursive=recursive,
                recursion_limit=recursion_limit):
            subds = Dataset(opj(ds.path, subds_path))
            if subds.id is None:
                # nothing to worry about, any meta data from below this will be
                # injected upstairs
                lgr.debug('skipping non-dataset at %s', subds.path)
                continue
            else:
                lgr.info('aggregating meta data for %s', subds)
            metapath = opj(subds.path, metadata_basepath)
            handle_dirty_dataset(subds, if_dirty)
            #
            # Phase 1: aggregate the within-dataset meta data, and store
            #          within the dataset
            #
            # pull out meta data from subds only (no subdatasets)
            _within_metadata_store(
                subds,
                guess_native_type,
                metapath)
            #
            # Phase 2: store everything that is in the look up and belongs into
            #          this dataset
            #
            _dump_submeta(subds, ds_meta, subds_path, save, modified_ds)
            # save state of modified dataset, all we modified has been staged
            # already
            # we need to save before extracting to full metadata for upstairs
            # consumption to get the versions right
            modified_ds = _save_helper(subds, save, modified_ds)
            #
            # Phase 3: obtain all aggregated meta data from this dataset, and
            #          keep in lookup to escalate it upstairs
            #
            ds_meta[subds_path] = get_metadata(
                subds,
                guess_type=False,
                ignore_subdatasets=False,
                ignore_cache=False)

        lgr.info('aggregating meta data for %s', ds)
        # pull out meta data from parent only (no subdatasets)
        _within_metadata_store(
            ds,
            guess_native_type,
            opj(ds.path, metadata_basepath))
        # and lastly the subdatasets of the parent
        _dump_submeta(ds, ds_meta, '', save, modified_ds)
        # everything should be stored somewhere by now
        assert not len(ds_meta)

        # save the parent
        modified_ds = _save_helper(ds, save, modified_ds)
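The aggregation above depends on visiting subdatasets from the bottom up, so that metadata from children is already stored when their parents are processed (hence the comment requiring `get_subdatasets()` to return results depth first). One simple way to obtain such an ordering from relative subdataset paths, shown here on made-up paths:

subdatasets = ['a', 'a/b', 'a/b/c', 'd']
# deepest paths first guarantees children are handled before their parents
for sub in sorted(subdatasets, key=lambda p: p.count('/'), reverse=True):
    print(sub)
# a/b/c, a/b, a, d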
Example #35
0
"""Procedure to apply YODA-compatible default setup to a dataset

This procedure assumes a clean dataset that was just created by
`datalad create`.
"""

import sys
import os.path as op

from datalad.distribution.dataset import require_dataset
from datalad.utils import create_tree

ds = require_dataset(
    sys.argv[1],
    check_installed=True,
    purpose='YODA dataset setup')

README_code = """\
All custom code goes into this directory. All scripts should be written such
that they can be executed from the root of the dataset, and are only using
relative paths for portability.
"""

README_top = """\
# Project <insert name>

## Dataset structure

- All inputs (i.e. building blocks from other sources) are located in
  `inputs/`.
- All custom code is located in `code/`.
Example #36
0
"""Procedure to configure Git annex to add text files directly to Git"""

import sys
import os.path as op

from datalad.distribution.dataset import require_dataset

ds = require_dataset(
    sys.argv[1],
    check_installed=True,
    purpose='configuration')

annex_largefiles = '(not(mimetype=text/*))'
attrs = ds.repo.get_gitattributes('*')
if not attrs.get('*', {}).get(
        'annex.largefiles', None) == annex_largefiles:
    ds.repo.set_gitattributes([
        ('*', {'annex.largefiles': annex_largefiles})])

git_attributes_file = op.join(ds.path, '.gitattributes')
ds.save(
    git_attributes_file,
    message="Instruct annex to add text files to Git",
)
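Procedures like the two above receive the dataset path as `sys.argv[1]`. In practice they are not run by hand but dispatched through `run_procedure`, as the dataset creation routine shown earlier does with its `cfg_` prefix. A hedged invocation sketch (the registered procedure name `cfg_text2git` is assumed):

from datalad.api import Dataset

ds = Dataset('/path/to/dataset')
# runs the procedure script with ds.path as its first argument
ds.run_procedure('cfg_text2git')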
Example #37
0
    def __call__(sshurl, target=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None, recursive=False,
                 existing='error', shared=False, ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None):

        if sshurl is None:
            raise ValueError("""insufficient information for target creation
            (needs at least a dataset and an SSH URL).""")

        if target is None and (target_url is not None or
                               target_pushurl is not None):
            raise ValueError("""insufficient information for adding the target
            as a sibling (needs at least a name)""")

        # shortcut
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')

        assert(ds is not None and sshurl is not None and ds.repo is not None)

        # determine target parameters:
        sshri = RI(sshurl)

        if not isinstance(sshri, SSHRI) \
                and not (isinstance(sshri, URL) and sshri.scheme == 'ssh'):
                    raise ValueError("Unsupported SSH URL: '{0}', use ssh://host/path or host:path syntax".format(sshurl))

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = False
        if "%NAME" not in target_dir:
            replicate_local_structure = True

        # collect datasets to use:
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_subdatasets(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name and
                # path of a submodule, which are technically different. This
                # probably will become important on windows as well as whenever
                # we want to allow for moved worktrees.
                datasets[basename(ds.path) + '/' + subds] = \
                    Dataset(sub_path)

        # request ssh connection:
        not_supported_on_windows("TODO")
        lgr.info("Connecting ...")
        ssh = ssh_manager.get_connection(sshurl)
        ssh.open()

        # flag to check if at dataset_root
        at_root = True

        # loop over all datasets, ordered from top to bottom, to make the test
        # below valid (existing directories would cause the machinery to halt).
        # But we need to run the post-update hook in depth-first fashion, so we
        # only collect the repositories first and run the hooks afterwards
        # (see gh #790)
        remote_repos_to_run_hook_for = []
        for current_dspath in \
                sorted(datasets.keys(), key=lambda x: x.count('/')):
            current_ds = datasets[current_dspath]
            if not current_ds.is_installed():
                lgr.info("Skipping %s since not installed locally", current_dspath)
                continue
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dspath.replace("/", "-"))
            else:
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                path = normpath(opj(target_dir,
                                    relpath(datasets[current_dspath].path,
                                            start=ds.path)))

            lgr.info("Creating target dataset {0} at {1}".format(current_dspath, path))
            # Must be set to True only if exists and existing='reconfigure'
            # otherwise we might skip actions if we say existing='reconfigure'
            # but it did not even exist before
            only_reconfigure = False
            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                try:
                    out, err = ssh(["ls", path])
                except CommandError as e:
                    if "No such file or directory" in e.stderr and \
                            path in e.stderr:
                        path_exists = False
                    else:
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'error':
                        raise RuntimeError("Target directory %s already exists." % path)
                    elif existing == 'skip':
                        continue
                    elif existing == 'replace':
                        ssh(["chmod", "+r+w", "-R", path])  # enable write permissions to allow removing dir
                        ssh(["rm", "-rf", path])            # remove target at path
                        path_exists = False                 # if we succeeded in removing it
                    elif existing == 'reconfigure':
                        only_reconfigure = True
                    else:
                        raise ValueError("Do not know how to handle existing=%s" % repr(existing))

                if not path_exists:
                    try:
                        ssh(["mkdir", "-p", path])
                    except CommandError as e:
                        lgr.error("Remotely creating target directory failed at "
                                  "%s.\nError: %s" % (path, exc_str(e)))
                        continue

            # don't (re-)initialize dataset if existing == reconfigure
            if not only_reconfigure:
                # init git and possibly annex repo
                if not CreateSibling.init_remote_repo(
                        path, ssh, shared, datasets[current_dspath],
                        description=target_url):
                    continue

            # check git version on remote end
            lgr.info("Adjusting remote git configuration")
            remote_git_version = CreateSibling.get_remote_git_version(ssh)
            if remote_git_version and remote_git_version >= "2.4":
                # allow for pushing to checked out branch
                try:
                    ssh(["git", "-C", path] +
                        ["config", "receive.denyCurrentBranch", "updateInstead"])
                except CommandError as e:
                    lgr.error("git config failed at remote location %s.\n"
                              "You will not be able to push to checked out "
                              "branch. Error: %s", path, exc_str(e))
            else:
                lgr.error("Git version >= 2.4 needed to configure remote."
                          " Version detected on server: %s\nSkipping configuration"
                          " of receive.denyCurrentBranch - you will not be able to"
                          " publish updates to this repository. Upgrade your git"
                          " and run with --existing=reconfigure"
                          % remote_git_version)

            # enable metadata refresh on dataset updates to publication server
            lgr.info("Enabling git post-update hook ...")
            try:
                CreateSibling.create_postupdate_hook(
                    path, ssh, datasets[current_dspath])
            except CommandError as e:
                lgr.error("Failed to add json creation command to post update "
                          "hook.\nError: %s" % exc_str(e))

            # publish web-interface to root dataset on publication server
            if at_root and ui:
                lgr.info("Uploading web interface to %s" % path)
                at_root = False
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    lgr.error("Failed to push web interface to the remote "
                              "datalad repository.\nError: %s" % exc_str(e))

            remote_repos_to_run_hook_for.append(path)

        # in reverse order would be depth first
        lgr.debug("Running post-update hooks in all created siblings")
        for path in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            try:
                ssh(
                    ["cd '" + _path_(path, ".git") + "' && hooks/post-update"],
                    wrap_args=False  # we wrapped here manually
                )
            except CommandError as e:
                lgr.error("Failed to run post-update hook under path %s. "
                          "Error: %s" % (path, exc_str(e)))

        if target:
            # add the sibling(s):
            lgr.debug("Adding the siblings")
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None and sshurl != target_url:
                target_pushurl = sshurl
            AddSibling()(dataset=ds,
                         name=target,
                         url=target_url,
                         pushurl=target_pushurl,
                         recursive=recursive,
                         fetch=True,
                         force=existing in {'replace'},
                         as_common_datasrc=as_common_datasrc,
                         publish_by_default=publish_by_default,
                         publish_depends=publish_depends)
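A hedged usage sketch for the SSH sibling creation above. `%NAME` in the target path is expanded per dataset (with `/` mangled to `-`), mirroring the template handling shown earlier; the parameter names follow the call signature above and may differ in other DataLad versions:

from datalad.api import Dataset

ds = Dataset('/path/to/dataset')
ds.create_sibling(
    'ssh://server.example.org/home/me/siblings/%NAME',  # per-dataset target directory
    target='server',                # sibling name to register locally
    recursive=True,
    existing='reconfigure',         # do not fail if target directories already exist
)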
Example #38
0
    def __call__(
        path=None,
        message=None,
        dataset=None,
        version_tag=None,
        recursive=False,
        recursion_limit=None,
        updated=False,
        message_file=None,
        to_git=None,
    ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        path = assure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   than a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as is,
                # and not a dataset instance, in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer='disabled'):
            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in iteritems(s)
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in iteritems(dataset_hierarchies):
            edges = {}
            discover_dataset_trace_to_targets(rootds,
                                              children, [],
                                              edges,
                                              includeds=children)
            for superds, subdss in iteritems(edges):
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[ut.Path(subds)] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
                paths_by_ds[superds] = superds_status

        # TODO parallelize: whenever we have multiple subdatasets of a single
        # dataset they can all be processed simultaneously
        # sort the list of datasets to handle, starting with the deepest ones
        for pdspath in sorted(paths_by_ds, reverse=True):
            pds = Dataset(pdspath)
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location; this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds.repo.pathobj / p.relative_to(pdspath): props
                for p, props in iteritems(paths_by_ds.pop(pdspath))
            }
            start_commit = pds.repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()):
                for res in pds.repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True
                        if not hasattr(ds.repo, 'annexstatus') else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = str(
                                # recode path back to dataset path anchor
                                pds.pathobj /
                                res[k].relative_to(pds.repo.pathobj))
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds.repo.get_hexsha() else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                continue
            try:
                pds.repo.tag(version_tag)
                dsres.update(status='ok', version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(status='error',
                             message=('cannot tag this version: %s',
                                      e.stderr.strip()))
                yield dsres
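A hedged usage sketch covering the three scenarios spelled out in the comments above (the exact API surface may vary across DataLad versions):

from datalad.api import Dataset

ds = Dataset('/path/to/dataset')

# (1) save modifications to already tracked content only
ds.save(message='update results', updated=True)

# (2) save everything, including untracked content, recursing into subdatasets
ds.save(message='snapshot', recursive=True)

# (3) like (2), but restricted to a given subset of paths
ds.save(path=['code/analysis.py'], message='fix analysis script')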
Example #39
0
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                message=None, rerun_info=None, rerun_outputs=None, sidecar=None):
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner

    lgr.debug('tracking command output underneath %s', ds)
    if not rerun_info and ds.repo.dirty:  # Rerun already takes care of this.
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('unsaved modifications present, '
                     'cannot detect changes by command'))
        return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    if inputs:
        for res in ds.get(inputs.expand(full=True), on_failure="ignore"):
            yield res

    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"],
                           warn=not rerun_info)
    if outputs:
        for res in _unlock_or_remove(ds, outputs.expand(full=True)):
            yield res

    if rerun_outputs is not None:
        # These are files we need to unlock/remove for a rerun that aren't
        # included in the explicit outputs. Unlike inputs/outputs, these are
        # full paths, so we can pass them directly to unlock.
        for res in _unlock_or_remove(ds, rerun_outputs):
            yield res

    sfmt = SequenceFormatter()
    cmd_expanded = sfmt.format(cmd,
                               pwd=pwd,
                               dspath=ds.path,
                               inputs=inputs.expand(dot=False),
                               outputs=outputs.expand(dot=False))

    # we have a clean dataset, let's run things
    exc = None
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd_expanded,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just pass on the exit code in the same way
        exc = e
        cmd_exitcode = e.code

        if rerun_info and rerun_info.get("exit", 0) != cmd_exitcode:
            # we failed in a different way during a rerun.  This can easily
            # happen if we try to alter a locked file
            #
            # TODO add the ability to `git reset --hard` the dataset tree on failure
            # we know that we started clean, so we could easily go back, needs gh-1424
            # to be able to do it recursively
            raise exc

    lgr.info("== Command exit (modification check follows) =====")

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    use_sidecar = sidecar or (
        sidecar is None and
        ds.config.get('datalad.run.record-sidecar', default=False))

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression; even for minimal records there is not much
            # difference, despite the offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd),
        '"{}"'.format(record_id) if use_sidecar else record)
    msg = assure_bytes(msg)

    if not rerun_info and cmd_exitcode:
        msg_path = opj(relpath(ds.repo.repo.git_dir), "COMMIT_EDITMSG")
        with open(msg_path, "wb") as ofh:
            ofh.write(msg)
        lgr.info("The command had a non-zero exit code. "
                 "If this is expected, you can save the changes with "
                 "'datalad save -r -F%s .'",
                 msg_path)
        raise exc
    else:
        for r in ds.add('.', recursive=True, message=msg):
            yield r
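The tail of `run_command` above either embeds the full JSON run record in the commit message or, when the `datalad.run.record-sidecar` option is enabled, stores the record in a sidecar file and embeds only its md5-based ID. A self-contained sketch of that record handling with made-up command and path values:

import json
from hashlib import md5

run_info = {
    'cmd': 'python analysis.py',
    'exit': 0,
    'chain': [],
    'inputs': ['data/raw.csv'],
    'outputs': ['results/summary.csv'],
}
record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)
record_id = md5(record.encode('utf-8')).hexdigest()

use_sidecar = True   # would come from the datalad.run.record-sidecar config
msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
""".format('python analysis.py',
           '"{}"'.format(record_id) if use_sidecar else record)
print(msg)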
Example #40
0
def diff_dataset(dataset,
                 fr,
                 to,
                 constant_refs,
                 path=None,
                 annex=None,
                 untracked='normal',
                 recursive=False,
                 recursion_limit=None,
                 eval_file_type=True,
                 reporting_order='depth-first'):
    """Internal helper to diff a dataset

    Parameters
    ----------
    dataset : Dataset
      Dataset to perform the diff on. `fr` and `to` parameters are interpreted
      in the context of this dataset.
    fr : str
      Commit-ish to compare from.
    to : str
      Commit-ish to compare to.
    constant_refs : bool
      If True, `fr` and `to` will be passed on unmodified to diff operations
      on subdatasets. This can be useful with symbolic references like tags
      to report subdataset changes independent of superdataset changes.
      If False, `fr` and `to` will be translated to the subdataset commit-ish
      that match the given commit-ish in the superdataset.
    path : Path-like, optional
      Paths to constrain the diff to (see main diff() command).
    annex : str, optional
      Reporting mode for annex properties (see main diff() command).
    untracked : str, optional
      Reporting mode for untracked content (see main diff() command).
    recursive : bool, optional
      Flag to enable recursive operation (see main diff() command).
    recursion_limit : int, optional
      Recursion limit (see main diff() command).
    eval_file_type : bool, optional
      Whether to perform file type discrimination between real symlinks
      and symlinks representing annex'ed files. This can be expensive
      in datasets with many files.
    reporting_order : {'depth-first', 'breadth-first'}, optional
      By default, subdataset content records are reported after the record
      on the subdataset's submodule in a superdataset (depth-first).
      Alternatively, report all superdataset records first, before reporting
      any subdataset content records (breadth-first).

    Yields
    ------
    dict
      DataLad result records.
    """
    if reporting_order not in ('depth-first', 'breadth-first'):
        raise ValueError('Unknown reporting order: {}'.format(reporting_order))

    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='difference reporting')

    # we cannot really perform any sorting of paths into subdatasets
    # or rejecting paths based on the state of the filesystem, as
    # we need to be able to compare with states that are not represented
    # in the worktree (anymore)
    if path:
        ps = []
        # sort any path argument into the respective subdatasets
        for p in sorted(assure_list(path)):
            # it is important to capture the exact form of the
            # given path argument, before any normalization happens
            # distinguish rsync-link syntax to identify
            # a dataset as whole (e.g. 'ds') vs its
            # content (e.g. 'ds/')
            # special case is the root dataset, always report its content
            # changes
            orig_path = str(p)
            resolved_path = resolve_path(p, dataset)
            p = \
                resolved_path, \
                orig_path.endswith(op.sep) or resolved_path == ds.pathobj
            str_path = str(p[0])
            root = get_dataset_root(str_path)
            if root is None:
                # no root, not possibly underneath the refds
                yield dict(action='status',
                           path=str_path,
                           refds=ds.path,
                           status='error',
                           message='path not underneath this dataset',
                           logger=lgr)
                continue
            if path_under_rev_dataset(ds, str_path) is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=str_path,
                    refds=ds.path,
                    action='diff',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str_path),
                    logger=lgr,
                )
                continue

            ps.append(p)
        path = ps

    # TODO we might want to move away from the single-pass+immediate-yield
    # paradigm for this command. If we gather all information first, we
    # could do post-processing and detect when a file (same gitsha, or same
    # key) was copied/moved from another dataset. Another command (e.g.
    # save) could act on this information and also move/copy
    # availability information or at least enhance the respective commit
    # message with cross-dataset provenance info

    # cache to help avoid duplicate status queries
    content_info_cache = {}
    for res in _diff_ds(
            ds,
            fr,
            to,
            constant_refs,
            recursion_limit if recursion_limit is not None and recursive else
            -1 if recursive else 0,
            # TODO recode paths to repo path reference
            origpaths=None if not path else OrderedDict(path),
            untracked=untracked,
            annexinfo=annex,
            eval_file_type=eval_file_type,
            cache=content_info_cache,
            order=reporting_order):
        res.update(
            refds=ds.path,
            logger=lgr,
            action='diff',
        )
        yield res
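The docstring above repeatedly refers to the main diff() command, which this helper serves. A hedged usage sketch through that public interface (parameter names follow the docstring; availability may vary by DataLad version):

from datalad.api import Dataset

ds = Dataset('/path/to/dataset')
# report changes between the previous and the current committed state,
# recursing into subdatasets
for res in ds.diff(fr='HEAD~1', to='HEAD', recursive=True):
    print(res['status'], res.get('state'), res['path'])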
Example #41
0
    def __call__(*,
                 dataset=None,
                 what=None,
                 dry_run=False,
                 recursive=False,
                 recursion_limit=None):

        ds = require_dataset(dataset,
                             purpose="report on cleanable locations"
                             if dry_run else "clean dataset")
        res_kwargs = dict(action='clean [dry-run]' if dry_run else 'clean',
                          logger=lgr,
                          refds=ds.path)
        for wds in itertools.chain(
            [ds],
                ds.subdatasets(state='present',
                               recursive=recursive,
                               recursion_limit=recursion_limit,
                               return_type='generator',
                               result_renderer='disabled',
                               result_xfm='datasets') if recursive else []):
            d = wds.pathobj
            gitdir = wds.repo.dot_git
            DIRS_PLURAL = ("directory", "directories")
            FILES_PLURAL = ("file", "files")
            discover_or_remove = "Discovered" if dry_run else "Removed"

            for dirpath, flag, msg, sing_pl in [
                (Path(ARCHIVES_TEMP_DIR), "cached-archives",
                 "temporary archive", DIRS_PLURAL),
                (Path(ANNEX_TEMP_DIR), "annex-tmp", "temporary annex",
                 FILES_PLURAL),
                (Path(ANNEX_TRANSFER_DIR), "annex-transfer",
                 "annex temporary transfer", DIRS_PLURAL),
                (gitdir / Path(SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", FILES_PLURAL),
            ]:
                topdir = wds.pathobj / dirpath
                lgr.debug("Considering to clean %s:%s", d, dirpath)
                if not ((what is None) or (flag in what)):
                    yield get_status_dict(path=str(topdir),
                                          status='notneeded',
                                          type='directory',
                                          **res_kwargs)
                    continue

                paths = list(topdir.glob('*'))
                if not paths:
                    if not topdir.exists():
                        yield get_status_dict(path=str(topdir),
                                              status='notneeded',
                                              type='directory',
                                              **res_kwargs)
                        continue
                    else:
                        # we empty topdir only
                        message = ("%s empty %s directory", discover_or_remove,
                                   msg)
                else:
                    pl = len(paths) > 1
                    message = ("%s %d %s %s: %s", discover_or_remove,
                               len(paths), msg, sing_pl[int(pl)], ", ".join(
                                   sorted([
                                       str(p.relative_to(topdir))
                                       for p in paths if p != topdir
                                   ])))

                if not dry_run:
                    rmtree(str(topdir))

                yield get_status_dict(path=str(topdir),
                                      status='ok',
                                      type='directory',
                                      message=message,
                                      **res_kwargs)
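A hedged usage sketch for the interface above; `dry_run` only reports what would be removed (parameter and flag names follow the signature and cleanup table shown):

from datalad.api import Dataset

ds = Dataset('/path/to/dataset')
# report cleanable locations without deleting anything
for res in ds.clean(what=['cached-archives', 'annex-tmp'],
                    dry_run=True, recursive=True):
    print(res['status'], res['path'], res.get('message'))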
Example #42
0
    def __call__(
            reponame,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            name='github',
            existing='error',
            github_login=None,
            github_passwd=None,
            github_organization=None,
            access_protocol='https',
            publish_depends=None,
            dryrun=False):
        try:
            # this is an absolute leaf package, import locally to avoid
            # unnecessary dependencies
            import github as gh
        except ImportError:
            raise MissingExternalDependency(
                'PyGitHub',
                msg='GitHub-related functionality is unavailable without this package')

        # what to operate on
        ds = require_dataset(
            dataset, check_installed=True, purpose='create Github sibling')
        # gather datasets and essential info
        # dataset instance and mountpoint relative to the top
        toprocess = [(ds, '')]
        if recursive:
            for sub in ds.subdatasets(
                    fulfilled=None,  # we want to report on missing datasets in here
                    recursive=recursive,
                    recursion_limit=recursion_limit,
                    result_xfm='datasets'):
                if not sub.is_installed():
                    lgr.info('Ignoring unavailable subdataset %s', sub)
                    continue
                toprocess.append((sub, relpath(sub.path, start=ds.path)))

        # check for existing remote configuration
        filtered = []
        for d, mp in toprocess:
            if name in d.repo.get_remotes():
                if existing == 'error':
                    msg = '{} already had a configured sibling "{}"'.format(
                        d, name)
                    if dryrun:
                        lgr.error(msg)
                    else:
                        raise ValueError(msg)
                elif existing == 'skip':
                    continue
            gh_reponame = '{}{}{}'.format(
                reponame,
                '-' if mp else '',
                template_fx(mp))
            filtered.append((d, gh_reponame))

        if not filtered:
            # all skipped
            return []

        # actually make it happen on Github
        rinfo = _make_github_repos(
            gh, github_login, github_passwd, github_organization, filtered,
            existing, access_protocol, dryrun)

        # lastly configure the local datasets
        for d, url, existed in rinfo:
            if not dryrun:
                # first make sure that annex doesn't touch this one
                # but respect any existing config
                ignore_var = 'remote.{}.annex-ignore'.format(name)
                if ignore_var not in d.config:
                    d.config.add(ignore_var, 'true', where='local')
                Siblings()(
                    'configure',
                    dataset=d,
                    name=name,
                    url=url,
                    recursive=False,
                    # TODO fetch=True, maybe only if one existed already
                    publish_depends=publish_depends)

        # TODO let submodule URLs point to Github (optional)
        return rinfo
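A hedged usage sketch following the call signature above (credential handling is left to the normal login/query mechanisms; the repository and organization names are made up):

from datalad.api import Dataset

ds = Dataset('/path/to/dataset')
ds.create_sibling_github(
    'my-project',                   # repository name on GitHub
    github_organization='my-lab',   # hypothetical organization; omit for a user repo
    name='github',                  # sibling name to configure locally
    recursive=True,
    existing='skip',
)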
Example #43
0
    def __call__(
            path=None,
            dataset=None,
            annex=None,
            untracked='normal',
            recursive=False,
            recursion_limit=None,
            eval_subdataset_state='full'):
        # To the next white knight that comes in to re-implement `status` as a
        # special case of `diff`. There is one fundamental difference between
        # the two commands: `status` can always use the worktree as evident on
        # disk as a constraint (e.g. to figure out which subdataset a path is in);
        # `diff` cannot do that (everything needs to be handled based on a
        # "virtual" representation of a dataset hierarchy).
        # MIH concludes that while `status` can be implemented as a special case
        # of `diff`, doing so would complicate and slow down both `diff` and
        # `status`. So while the apparent near-duplication of code between the
        # two commands feels wrong, the benefit is speed. Any future RF should
        # come with evidence that speed does not suffer, and complexity stays
        # on a manageable level
        ds = require_dataset(
            dataset, check_installed=True, purpose='status reporting')

        paths_by_ds = OrderedDict()
        if path:
            # sort any path argument into the respective subdatasets
            for p in sorted(assure_list(path)):
                # it is important to capture the exact form of the
                # given path argument, before any normalization happens
                # for further decision logic below
                orig_path = text_type(p)
                p = rev_resolve_path(p, dataset)
                root = rev_get_dataset_root(text_type(p))
                if root is None:
                    # no root, not possibly underneath the refds
                    yield dict(
                        action='status',
                        path=p,
                        refds=ds.path,
                        status='error',
                        message='path not underneath this dataset',
                        logger=lgr)
                    continue
                else:
                    if dataset and root == text_type(p) and \
                            not (orig_path.endswith(op.sep) or
                                 orig_path == "."):
                        # the given path is pointing to a dataset
                        # distinguish rsync-like syntax to identify
                        # the dataset as a whole (e.g. 'ds') vs its
                        # content (e.g. 'ds/')
                        super_root = rev_get_dataset_root(op.dirname(root))
                        if super_root:
                            # the dataset identified by the path argument
                            # is contained in a superdataset, and no
                            # trailing path separator was found in the
                            # argument -> user wants to address the dataset
                            # as a whole (in the superdataset)
                            root = super_root

                root = ut.Path(root)
                ps = paths_by_ds.get(root, [])
                ps.append(p)
                paths_by_ds[root] = ps
        else:
            paths_by_ds[ds.pathobj] = None

        queried = set()
        content_info_cache = {}
        while paths_by_ds:
            qdspath, qpaths = paths_by_ds.popitem(last=False)
            if qpaths and qdspath in qpaths:
                # this is supposed to be a full query, save some
                # cycles sifting through the actual path arguments
                qpaths = []
            # try to recode the dataset path with respect to the reference
            # dataset; the path that it was located by could
            # have been a resolved path or another funky thing
            qds_inrefds = path_under_rev_dataset(ds, qdspath)
            if qds_inrefds is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=text_type(qdspath),
                    refds=ds.path,
                    action='status',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        ds, qpaths),
                    logger=lgr,
                )
                continue
            elif qds_inrefds != qdspath:
                # the path this dataset was located by is not how it would
                # be referenced underneath the refds (possibly resolved
                # realpath) -> recode all paths to be underneath the refds
                qpaths = [qds_inrefds / p.relative_to(qdspath) for p in qpaths]
                qdspath = qds_inrefds
            if qdspath in queried:
                # do not report on a single dataset twice
                continue
            qds = Dataset(text_type(qdspath))
            for r in _yield_status(
                    qds,
                    qpaths,
                    annex,
                    untracked,
                    recursion_limit
                    if recursion_limit is not None else -1
                    if recursive else 0,
                    queried,
                    eval_subdataset_state,
                    content_info_cache):
                yield dict(
                    r,
                    refds=ds.path,
                    action='status',
                    status='ok',
                )
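
Two conventions are folded into the path handling above: the rsync-like trailing separator that distinguishes addressing a dataset as a whole ('ds') from addressing its content ('ds/' or '.'), and the recursion encoding handed to _yield_status, where -1 means unlimited recursion and 0 means none. A standalone sketch of both (the helper names are illustrative only):

    import os.path as op

    def addresses_content(orig_path):
        # 'ds/' (trailing separator) or '.' address the dataset's content,
        # a bare 'ds' addresses the dataset as a whole within its superdataset
        return orig_path.endswith(op.sep) or orig_path == "."

    def encode_recursion(recursive, recursion_limit=None):
        # mirrors the expression passed to _yield_status above:
        # an explicit limit wins, otherwise -1 (unlimited) if recursive, else 0
        return recursion_limit if recursion_limit is not None \
            else (-1 if recursive else 0)

    assert addresses_content('ds' + op.sep) and not addresses_content('ds')
    assert encode_recursion(False) == 0
    assert encode_recursion(True) == -1
    assert encode_recursion(True, recursion_limit=2) == 2
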
Beispiel #44
0
    def __call__(dataset=None, sensitive=None, sections=None, decor=None, clipboard=None):
        from datalad.distribution.dataset import require_dataset
        from datalad.support.exceptions import NoDatasetArgumentFound
        from datalad.interface.results import get_status_dict

        ds = None
        try:
            ds = require_dataset(dataset, check_installed=False, purpose='reporting')
        except NoDatasetArgumentFound:
            # failure is already logged
            pass
        if ds and not ds.is_installed():
            # we don't deal with absent datasets
            ds = None
        if sensitive:
            if ds is None:
                from datalad import cfg
            else:
                cfg = ds.config
        else:
            cfg = None

        from datalad.ui import ui
        from datalad.support.external_versions import external_versions

        infos = OrderedDict()
        res = get_status_dict(
            action='wtf',
            path=ds.path if ds else op.abspath(op.curdir),
            type='dataset' if ds else 'directory',
            status='ok',
            logger=lgr,
            decor=decor,
            infos=infos,
        )

        # Define section callables which require variables.
        # so there is no side effect on the module-level original
        section_callables = SECTION_CALLABLES.copy()
        section_callables['location'] = partial(_describe_location, res)
        section_callables['configuration'] = \
            partial(_describe_configuration, cfg, sensitive)
        if ds:
            section_callables['dataset'] = \
                partial(_describe_dataset, ds, sensitive)
        else:
            section_callables.pop('dataset')
        assert all(section_callables.values())  # check if none was missed

        if sections is None:
            sections = sorted(list(section_callables))

        for s in sections:
            infos[s] = section_callables[s]()

        if clipboard:
            external_versions.check(
                'pyperclip', msg="It is needed to be able to use the clipboard")
            import pyperclip
            report = _render_report(res)
            pyperclip.copy(assure_bytes(report))
            ui.message("WTF information of length %s copied to clipboard"
                       % len(report))
        yield res
        return
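
The section registry above copies a module-level mapping and binds runtime state (the result record, config, dataset) into the describer callables via functools.partial. A minimal standalone sketch of that pattern; the describer functions and section names here are made up for illustration:

    from collections import OrderedDict
    from functools import partial

    def _describe_location(res):
        # needs runtime state, hence bound with partial below
        return {'path': res['path'], 'type': res['type']}

    def _describe_python():
        import sys
        return {'version': sys.version.split()[0]}

    SECTION_CALLABLES = {'location': None, 'python': _describe_python}

    res = {'path': '/tmp/some/dir', 'type': 'directory'}
    section_callables = SECTION_CALLABLES.copy()  # no side effect on the original
    section_callables['location'] = partial(_describe_location, res)
    assert all(section_callables.values())  # every section has a callable now

    infos = OrderedDict()
    for s in sorted(section_callables):
        infos[s] = section_callables[s]()
    print(infos)
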
Beispiel #45
0
    def __call__(dataset, filename=None, missing_content='error', no_annex=False,
                 # TODO: support working with projects and articles within them
                 # project_id=None,
                 article_id=None):
        import os
        import logging
        lgr = logging.getLogger('datalad.plugin.export_to_figshare')

        from datalad.ui import ui
        from datalad.api import add_archive_content
        from datalad.api import export_archive
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export to figshare')

        if not isinstance(dataset.repo, AnnexRepo):
            raise ValueError(
                "%s is not an annex repo, so annexification could be done"
                % dataset
            )

        if dataset.repo.is_dirty():
            raise RuntimeError(
                "Paranoid authors of DataLad refuse to proceed in a dirty repository"
            )
        if filename is None:
            filename = dataset.path
        lgr.info(
            "Exporting current tree as an archive under %s since figshare "
            "does not support directories",
            filename
        )
        archive_out = next(
            export_archive(
                dataset,
                filename=filename,
                archivetype='zip',
                missing_content=missing_content,
                return_type="generator"
            )
        )
        assert archive_out['status'] == 'ok'
        fname = archive_out['path']

        lgr.info("Uploading %s to figshare", fname)
        figshare = FigshareRESTLaison()

        if not article_id:
            # TODO: ask if it should be an article within a project
            if ui.is_interactive:
                # or should we just upload to a new article?
                if ui.yesno(
                    "Would you like to create a new article to upload to?  "
                    "If not - we will list existing articles",
                    title="Article"
                ):
                    article = figshare.create_article(
                        title=os.path.basename(dataset.path)
                    )
                    lgr.info(
                        "Created a new (private) article %(id)s at %(url_private_html)s. "
                        "Please visit it, enter additional meta-data and make public",
                        article
                    )
                    article_id = article['id']
                else:
                    article_id = int(ui.question(
                        "Which of the articles should we upload to.",
                        choices=list(map(str, figshare.get_article_ids()))
                    ))
            if not article_id:
                raise ValueError("We need an article to upload to.")

        file_info = figshare.upload_file(
            fname,
            files_url='account/articles/%s/files' % article_id
        )

        if no_annex:
            lgr.info("Removing generated tarball")
            unlink(fname)
        else:
            # leave all the complaining etc. to dataset add, e.g. if the path
            # is outside the dataset
            lgr.info("'Registering' %s within annex", fname)
            repo = dataset.repo
            repo.add(fname, git=False)
            key = repo.get_file_key(fname)
            lgr.info("Adding URL %(download_url)s for it", file_info)
            repo._annex_custom_command([],
                [
                    "git", "annex", "registerurl", '-c', 'annex.alwayscommit=false',
                    key, file_info['download_url']
                ]
            )

            lgr.info("Registering links back for the content of the archive")
            add_archive_content(
                fname,
                annex=dataset.repo,
                delete_after=True,  # just remove extracted into a temp dir
                allow_dirty=True,  # since we have the archive sitting in the worktree
                commit=False  # we do not want to commit anything we have done here
            )

            lgr.info("Removing generated and now registered in annex archive")
            repo.drop(key, key=True, options=['--force'])
            repo.remove(fname, force=True)  # remove the tarball

            # if annex in {'delete'}:
            #     dataset.repo.remove(fname)
            # else:
            #     # kinda makes little sense I guess.
            #     # Made more sense if export_archive could export an arbitrary treeish
            #     # so we could create a branch where to dump and export to figshare
            #     # (kinda closer to my idea)
            #     dataset.save(fname, message="Added the entire dataset into a zip file")

        # TODO: add to downloader knowledge about figshare token so it could download-url
        # those zipballs before they go public
        yield dict(
            status='ok',
            # TODO: add article url (which needs to be queried if only the ID is known)
            message="Published archive {}".format(
                file_info['download_url']),
            file_info=file_info,
            path=dataset,
            action='export_to_figshare',
            logger=lgr
        )
Beispiel #46
0
    def __call__(
            path=None,
            sibling=None,
            merge=False,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            fetch_all=None,
            reobtain_data=False):
        """
        """
        if fetch_all is not None:
            lgr.warning('update(fetch_all=...) called. Option has no effect, and will be removed')

        if not dataset and not path:
            # try to find a dataset in PWD
            dataset = require_dataset(
                None, check_installed=True, purpose='updating')
        refds_path = Interface.get_refds_path(dataset)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        save_paths = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='update',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if not ap.get('type', None) == 'dataset':
                ap.update(
                    status='impossible',
                    message="can only update datasets")
                yield ap
                continue
            # this is definitely a dataset from here on
            ds = Dataset(ap['path'])
            if not ds.is_installed():
                lgr.debug("Skipping update since not installed %s", ds)
                continue
            repo = ds.repo
            # prepare return value
            # TODO reuse AP for return props
            res = get_status_dict('update', ds=ds, logger=lgr, refds=refds_path)
            # get all remotes which have references (would exclude
            # special remotes)
            remotes = repo.get_remotes(
                **({'exclude_special_remotes': True} if isinstance(repo, AnnexRepo) else {}))
            if not remotes and not sibling:
                res['message'] = ("No siblings known to dataset at %s\nSkipping",
                                  repo.path)
                res['status'] = 'notneeded'
                yield res
                continue
            if not sibling and len(remotes) == 1:
                # there is only one remote, must be this one
                sibling_ = remotes[0]
            elif not sibling:
                # nothing given, look for tracking branch
                sibling_ = repo.get_tracking_branch()[0]
            else:
                sibling_ = sibling
            if sibling_ and sibling_ not in remotes:
                res['message'] = ("'%s' not known to dataset %s\nSkipping",
                                  sibling_, repo.path)
                res['status'] = 'impossible'
                yield res
                continue
            if not sibling_ and len(remotes) > 1 and merge:
                lgr.debug("Found multiple siblings:\n%s" % remotes)
                res['status'] = 'impossible'
                res['message'] = "Multiple siblings, please specify from which to update."
                yield res
                continue
            lgr.info("Fetching updates for %s", ds)
            # fetch remote
            fetch_kwargs = dict(
                # test against user-provided value!
                remote=None if sibling is None else sibling_,
                all_=sibling is None,
                # required to not trip over submodules that
                # were removed in the origin clone
                recurse_submodules="no",
                prune=True)  # prune to not accumulate a mess over time
            repo.fetch(**fetch_kwargs)
            # NOTE if any further access to `repo` is needed, re-evaluate
            # ds.repo again, as it might have been converted from a GitRepo
            # to an AnnexRepo
            if merge:
                for fr in _update_repo(ds, sibling_, reobtain_data):
                    yield fr
            res['status'] = 'ok'
            yield res
            save_paths.append(ap['path'])
        if recursive:
            save_paths = [p for p in save_paths if p != refds_path]
            if not save_paths:
                return
            lgr.debug(
                'Subdatasets whose updated state may need to be '
                'saved in the parent dataset: %s', save_paths)
            for r in Dataset(refds_path).save(
                    path=save_paths,
                    recursive=False,
                    message='[DATALAD] Save updated subdatasets'):
                yield r
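
The sibling to update from is resolved above with a simple precedence: an explicitly given sibling wins, otherwise a sole configured remote is used, otherwise the remote of the tracking branch; anything else is reported as an error. A standalone sketch of just that decision (remote names are made up):

    def pick_sibling(sibling, remotes, tracking_remote):
        # explicit choice > sole remote > tracking branch's remote
        if sibling:
            return sibling
        if len(remotes) == 1:
            return remotes[0]
        return tracking_remote

    print(pick_sibling(None, ['origin'], None))                    # origin
    print(pick_sibling(None, ['origin', 'github'], 'origin'))      # origin
    print(pick_sibling('github', ['origin', 'github'], 'origin'))  # github
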
Beispiel #47
0
    def __call__(
            path=None,
            dataset=None,
            get_aggregates=False,
            reporton='all',
            recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results;
            # these are actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(
                refds_path,
                check_installed=True,
                purpose='aggregate metadata query')
            agginfos = load_ds_aggregate_db(
                ds,
                version=str(aggregate_layout_version),
                abspath=True
            )
            if not agginfos:
                # if an aggregation run had ever been performed, this file would
                # exist; since it does not, there has not been one and we need
                # to tell this to people
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message='metadata aggregation has never been performed in this dataset')
                return
            parentds = []
            for dspath in sorted(agginfos):
                info = agginfos[dspath]
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if dspath == ds.path:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(
                    info,
                    **res_kwargs
                )
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = op.curdir

        content_by_ds = OrderedDict()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                # MIH: we are querying the aggregated metadata anyways, and that
                # mechanism has its own, faster way to go down the hierarchy
                #recursive=recursive,
                #recursion_limit=recursion_limit,
                action='metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                # we need to know when to look into aggregated data
                force_subds_discovery=True,
                force_parentds_discovery=True,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [ap for ap in content_by_ds[ds_path]
                         # this is an available subdataset, will be processed in another
                         # iteration
                         if ap.get('state', None) == 'absent' or
                         not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return
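
The parentds bookkeeping in the aggregate branch above behaves like a stack over the sorted dataset paths: the top is dropped once the current path is no longer underneath it. A simplified standalone sketch (it pops in a loop, whereas the code above pops at most once per record, and the subpath test is a plain stand-in):

    def path_is_subpath(path, prefix):
        # simplified stand-in for the helper used above
        return path != prefix and path.startswith(prefix.rstrip('/') + '/')

    agg_paths = sorted(['/ds', '/ds/sub1', '/ds/sub1/deep', '/ds/sub2'])
    parentds = []
    for dspath in agg_paths:
        while parentds and not path_is_subpath(dspath, parentds[-1]):
            parentds.pop()
        print(dspath, '-> parentds:', parentds[-1] if parentds else None)
        parentds.append(dspath)
    # /ds -> None, /ds/sub1 -> /ds, /ds/sub1/deep -> /ds/sub1, /ds/sub2 -> /ds
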
Beispiel #48
0
    def __call__(
            spec=None,
            dataset=None,
            discover=False,
            help_proc=False):
        if not spec and not discover:
            raise InsufficientArgumentsError('requires at least a procedure name')
        if help_proc and not spec:
            raise InsufficientArgumentsError('requires a procedure name')

        try:
            ds = require_dataset(
                dataset, check_installed=False,
                purpose='run a procedure')
        except NoDatasetArgumentFound:
            ds = None

        if discover:
            reported = set()
            for m, cmd_name, cmd_tmpl, cmd_help in \
                    _get_procedure_implementation('*', ds=ds):
                if m in reported:
                    continue
                ex = _guess_exec(m)
                # configured template (call-format string) takes precedence:
                if cmd_tmpl:
                    ex['template'] = cmd_tmpl
                if ex['type'] is None and ex['template'] is None:
                    # doesn't seem like a match
                    lgr.debug("Neither type nor execution template found for "
                              "%s. Ignored.", m)
                    continue
                message = ex['type'] if ex['type'] else 'unknown type'
                message += ' (missing)' if ex['state'] == 'absent' else ''
                res = get_status_dict(
                    action='discover_procedure',
                    path=m,
                    type='file',
                    logger=lgr,
                    refds=ds.path if ds else None,
                    status='ok',
                    state=ex['state'],
                    procedure_name=cmd_name,
                    procedure_type=ex['type'],
                    procedure_callfmt=ex['template'],
                    procedure_help=cmd_help,
                    message=message)
                reported.add(m)
                yield res
            return

        if not isinstance(spec, (tuple, list)):
            # maybe coming from config
            import shlex
            spec = shlex.split(spec)
        name = spec[0]
        args = spec[1:]

        try:
            # get the first match and run with it
            procedure_file, cmd_name, cmd_tmpl, cmd_help = \
                next(_get_procedure_implementation(name, ds=ds))
        except StopIteration:
            res = get_status_dict(
                    action='run_procedure',
                    # TODO: Default renderer requires a key "path" to exist.
                    # Doesn't make a lot of sense in this case
                    path=name,
                    logger=lgr,
                    refds=ds.path if ds else None,
                    status='impossible',
                    message="Cannot find procedure with name '%s'" % name)
            yield res
            return

        ex = _guess_exec(procedure_file)
        # configured template (call-format string) takes precedence:
        if cmd_tmpl:
            ex['template'] = cmd_tmpl

        if help_proc:
            if cmd_help:
                res = get_status_dict(
                        action='procedure_help',
                        path=procedure_file,
                        type='file',
                        logger=lgr,
                        refds=ds.path if ds else None,
                        status='ok',
                        state=ex['state'],
                        procedure_name=cmd_name,
                        procedure_type=ex['type'],
                        procedure_callfmt=ex['template'],
                        message=cmd_help)
            else:
                res = get_status_dict(
                        action='procedure_help',
                        path=procedure_file,
                        type='file',
                        logger=lgr,
                        refds=ds.path if ds else None,
                        status='impossible',
                        state=ex['state'],
                        procedure_name=cmd_name,
                        procedure_type=ex['type'],
                        procedure_callfmt=ex['template'],
                        message="No help available for '%s'" % name)

            yield res
            return

        if not ex['template']:
            raise ValueError("No idea how to execute procedure %s. "
                             "Missing 'execute' permissions?" % procedure_file)

        cmd = ex['template'].format(
            script=procedure_file,
            ds=ds.path if ds else '',
            args=u' '.join(u'"{}"'.format(a) for a in args) if args else '')
        lgr.info("Running procedure %s", name)
        lgr.debug('Full procedure command: %r', cmd)
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                explicit=True,
                inputs=None,
                outputs=None,
                # pass through here
                on_failure='ignore',
                return_type='generator'
        ):
            yield r
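
Running a procedure above boils down to tokenizing the spec (when it arrives as a plain string, e.g. from config) and filling a call-format template with the script path, the dataset path, and the quoted arguments. A standalone sketch with a hypothetical procedure name, template, and paths:

    import shlex

    spec = 'cfg_myproc "some arg" other'   # hypothetical spec, e.g. from config
    if not isinstance(spec, (tuple, list)):
        spec = shlex.split(spec)           # -> ['cfg_myproc', 'some arg', 'other']
    name, args = spec[0], spec[1:]

    template = 'python "{script}" "{ds}" {args}'   # hypothetical call-format string
    cmd = template.format(
        script='/path/to/procedures/cfg_myproc.py',
        ds='/path/to/dataset',
        args=u' '.join(u'"{}"'.format(a) for a in args) if args else '')
    print(cmd)
    # python "/path/to/procedures/cfg_myproc.py" "/path/to/dataset" "some arg" "other"
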
Beispiel #49
0
    def __call__(sshurl, name=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None, annex_group=None, annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case if not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option"
                )
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified"
                )
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings"
            )
            # maybe this could be safely dropped -- still WiP

        if not sshurl:
            # TODO: maybe back up more before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds
                )
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

        # check the login URL
        sshri = RI(sshurl)
        if not is_ssh(sshri):
            raise ValueError(
                "Unsupported SSH URL: '{0}', "
                "use ssh://host/path or host:path syntax".format(sshurl))

        if not name:
            # use the hostname as default remote name
            name = sshri.hostname
            lgr.debug(
                "No sibling name given, use URL hostname '%s' as sibling name",
                name)

        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume a one-to-one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # both next should not happen anyways
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify alternative name, or force "
                    "reconfiguration via --existing",
                    name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        # request ssh connection:
        lgr.info("Connecting ...")
        assert(sshurl is not None)  # delayed sanity verification
        ssh = ssh_manager.get_connection(sshurl)
        if not ssh.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg='on the remote system')

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom, to make the test
        # below valid (existing directories would cause the machinery to halt).
        # But we need to run the post-update hooks in depth-first fashion, so
        # we only collect them here and run them afterwards (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                ds.path,
                ssh,
                replicate_local_structure,
                sshri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == ds.path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                ssh("cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if not currentds_ap['path'] in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap
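
The two orderings above are the crux of the loop structure: remote directories must be created top-down (parents before children, which is why candidates are sorted by path depth), while the post-update hooks are run over the collected list in reverse, i.e. depth-first. A tiny standalone illustration with made-up paths:

    paths = ['/srv/ds/sub/a', '/srv/ds', '/srv/ds/sub']
    top_down = sorted(paths, key=lambda x: x.count('/'))
    # ['/srv/ds', '/srv/ds/sub', '/srv/ds/sub/a']  -> order for creating remote dirs
    depth_first = top_down[::-1]
    # ['/srv/ds/sub/a', '/srv/ds/sub', '/srv/ds']  -> order for running post-update hooks
    print(top_down)
    print(depth_first)
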
Beispiel #50
0
    def __call__(path=None, message=None, dataset=None,
                 version_tag=None,
                 recursive=False, recursion_limit=None,
                 updated=False,
                 message_file=None,
                 to_git=None,
                 ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        path = assure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   than a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as is,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer='disabled'):
            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in iteritems(s)
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in iteritems(dataset_hierarchies):
            edges = {}
            discover_dataset_trace_to_targets(
                rootds, children, [], edges, includeds=children)
            for superds, subdss in iteritems(edges):
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[ut.Path(subds)] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
                paths_by_ds[superds] = superds_status

        # TODO parallelize, whenever we have multiple subdataset of a single
        # dataset they can all be processed simultaneously
        # sort list of dataset to handle, starting with the ones deep down
        for pdspath in sorted(paths_by_ds, reverse=True):
            pds = Dataset(pdspath)
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location; this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds.repo.pathobj / p.relative_to(pdspath): props
                for p, props in iteritems(paths_by_ds.pop(pdspath))}
            start_commit = pds.repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()):
                for res in pds.repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True if not hasattr(ds.repo, 'annexstatus')
                        else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = text_type(
                                # recode path back to dataset path anchor
                                pds.pathobj / res[k].relative_to(
                                    pds.repo.pathobj)
                            )
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds.repo.get_hexsha()
                else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                continue
            try:
                pds.repo.tag(version_tag)
                dsres.update(
                    status='ok',
                    version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(
                    status='error',
                    message=('cannot tag this version: %s', e.stderr.strip()))
                yield dsres
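
Saving proceeds bottom-up: iterating the collected dataset paths in reverse lexicographic order visits subdatasets before their superdatasets, so each parent commit can record the already-updated submodule state. A minimal illustration with hypothetical paths:

    paths_by_ds = {'/ds': {}, '/ds/subA': {}, '/ds/subA/deep': {}, '/ds/subB': {}}
    for pdspath in sorted(paths_by_ds, reverse=True):
        print('saving', pdspath)
    # saving /ds/subB
    # saving /ds/subA/deep
    # saving /ds/subA
    # saving /ds
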
Beispiel #51
0
    def __call__(match,
                 dataset=None,
                 search=None,
                 report=None,
                 report_matched=False,
                 format='custom',
                 regex=False):

        lgr.debug("Initiating search for match=%r and dataset %r",
                  match, dataset)
        try:
            ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            exc_info = sys.exc_info()
            if dataset is None:
                if not ui.is_interactive:
                    raise NoDatasetArgumentFound(
                        "No DataLad dataset found. Specify a dataset to be "
                        "searched, or run interactively to get assistance "
                        "installing a queriable superdataset."
                    )
                # none was provided, so we may ask the user whether they want
                # to install our beautiful mega-duper-super-dataset
                # TODO: following logic could possibly benefit other actions.
                if os.path.exists(LOCAL_CENTRAL_PATH):
                    central_ds = Dataset(LOCAL_CENTRAL_PATH)
                    if central_ds.is_installed():
                        if ui.yesno(
                            title="No DataLad dataset found at current location",
                            text="Would you like to search the DataLad "
                                 "superdataset at %r?"
                                  % LOCAL_CENTRAL_PATH):
                            pass
                        else:
                            reraise(*exc_info)
                    else:
                        raise NoDatasetArgumentFound(
                            "No DataLad dataset found at current location. "
                            "The DataLad superdataset location %r exists, "
                            "but does not contain an dataset."
                            % LOCAL_CENTRAL_PATH)
                elif ui.yesno(
                        title="No DataLad dataset found at current location",
                        text="Would you like to install the DataLad "
                             "superdataset at %r?"
                             % LOCAL_CENTRAL_PATH):
                    from datalad.api import install
                    central_ds = install(LOCAL_CENTRAL_PATH, source='///')
                    ui.message(
                        "From now on you can refer to this dataset using the "
                        "label '///'"
                    )
                else:
                    reraise(*exc_info)

                lgr.info(
                    "Performing search using DataLad superdataset %r",
                    central_ds.path
                )
                for res in central_ds.search(
                        match,
                        search=search, report=report,
                        report_matched=report_matched,
                        format=format, regex=regex):
                    yield res
                return
            else:
                raise

        cache_dir = opj(opj(ds.path, get_git_dir(ds.path)), 'datalad', 'cache')
        mcache_fname = opj(cache_dir, 'metadata.p%d' % pickle.HIGHEST_PROTOCOL)

        meta = None
        if os.path.exists(mcache_fname):
            lgr.debug("use cached metadata of '{}' from {}".format(ds, mcache_fname))
            meta, checksum = pickle.load(open(mcache_fname, 'rb'))
            # TODO add more sophisticated tests to decide when the cache is no longer valid
            if checksum != ds.repo.get_hexsha():
                # errrr, try again below
                meta = None

        # don't put this in an 'else' branch: yet-to-be-written tests above might fail
        # and require regenerating the metadata
        if meta is None:
            lgr.info("Loading and caching local meta-data... might take a few seconds")
            if not exists(cache_dir):
                os.makedirs(cache_dir)

            meta = get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                                ignore_cache=False)
            # merge all info on datasets into a single dict per dataset
            meta = flatten_metadata_graph(meta)
            # extract graph, if any
            meta = meta.get('@graph', meta)
            # build simple queriable representation
            if not isinstance(meta, list):
                meta = [meta]

            # sort entries by location (if present)
            sort_keys = ('location', 'description', 'id')
            meta = sorted(meta, key=lambda m: tuple(m.get(x, "") for x in sort_keys))

            # use pickle to store the optimized graph in the cache
            pickle.dump(
                # graph plus checksum from what it was built
                (meta, ds.repo.get_hexsha()),
                open(mcache_fname, 'wb'))
            lgr.debug("cached meta data graph of '{}' in {}".format(ds, mcache_fname))

        if report in ('', ['']):
            report = []
        elif report and not isinstance(report, list):
            report = [report]

        match = assure_list(match)
        search = assure_list(search)
        # convert all to lower case for case insensitive matching
        search = {x.lower() for x in search}

        def get_in_matcher(m):
            """Function generator to provide closure for a specific value of m"""
            mlower = m.lower()

            def matcher(s):
                return mlower in s.lower()
            return matcher

        matchers = [
            re.compile(match_).search
            if regex
            else get_in_matcher(match_)
            for match_ in match
        ]

        # location should be reported relative to current location
        # We will assume that no one calls chpwd while we are yielding
        ds_path_prefix = get_path_prefix(ds.path)

        # So we can provide a useful message whenever not a single
        # dataset had the specified `--search` properties
        observed_properties = set()

        # for every meta data set
        for mds in meta:
            hit = False
            hits = [False] * len(matchers)
            matched_fields = set()
            if not mds.get('type', mds.get('schema:type', None)) == 'Dataset':
                # we are presently only dealing with datasets
                continue
            # TODO consider the possibility of nested and context/graph dicts
            # but so far we were trying to build simple lists of dicts, as much
            # as possible
            if not isinstance(mds, dict):
                raise NotImplementedError("nested meta data is not yet supported")

            # manual loop for now
            for k, v in iteritems(mds):
                if search:
                    k_lower = k.lower()
                    if k_lower not in search:
                        if observed_properties is not None:
                            # record for providing a hint later
                            observed_properties.add(k_lower)
                        continue
                    # so we have a hit, no need to track
                    observed_properties = None
                if isinstance(v, dict) or isinstance(v, list):
                    v = text_type(v)
                for imatcher, matcher in enumerate(matchers):
                    if matcher(v):
                        hits[imatcher] = True
                        matched_fields.add(k)
                if all(hits):
                    hit = True
                    # no need to do it longer than necessary
                    if not report_matched:
                        break

            if hit:
                location = mds.get('location', '.')
                report_ = matched_fields.union(report if report else {}) \
                    if report_matched else report
                if report_ == ['*']:
                    report_dict = mds
                elif report_:
                    report_dict = {k: mds[k] for k in report_ if k in mds}
                    if report_ and not report_dict:
                        lgr.debug(
                            'meta data match for %s, but no to-be-reported '
                            'properties (%s) found. Present properties: %s',
                            location, ", ".join(report_), ", ".join(sorted(mds))
                        )
                else:
                    report_dict = {}  # it was empty but not None -- asked to
                    # not report any specific field
                if isinstance(location, (list, tuple)):
                    # could be that the same dataset is installed into multiple
                    # locations. For now report them separately
                    for l in location:
                        yield opj(ds_path_prefix, l), report_dict
                else:
                    yield opj(ds_path_prefix, location), report_dict

        if search and observed_properties is not None:
            import difflib
            suggestions = {
                s: difflib.get_close_matches(s, observed_properties)
                for s in search
            }
            suggestions_str = "\n ".join(
                "%s for %s" % (", ".join(choices), s)
                for s, choices in iteritems(suggestions) if choices
            )
            lgr.warning(
                "Found no properties which matched one of the one you "
                "specified (%s).  May be you meant one among: %s.\n"
                "Suggestions:\n"
                " %s",
                ", ".join(search),
                ", ".join(observed_properties),
                suggestions_str if suggestions_str.strip() else "none"
            )
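
The matcher construction above relies on a closure factory so that each substring matcher captures its own lowercased needle instead of a shared loop variable, and a record only counts as a hit when all matchers agree. A standalone sketch with made-up search terms and a made-up record:

    import re

    def get_in_matcher(m):
        mlower = m.lower()
        def matcher(s):
            return mlower in s.lower()
        return matcher

    regex = False
    terms = ['brain', 'mri t1']
    matchers = [re.compile(t).search if regex else get_in_matcher(t) for t in terms]
    record = 'Structural MRI T1 scans of the human BRAIN'
    hits = [bool(m(record)) for m in matchers]
    print(hits, all(hits))   # [True, True] True -> this record is a hit
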
Beispiel #52
0
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            if_dirty='save-before'):
        if dataset:
            dataset = require_dataset(
                dataset, check_installed=False, purpose='removal')
            if not dataset.is_installed() and not path:
                # all done already
                return []
            if not path:
                # act on the whole dataset if nothing else was specified
                path = dataset.path if isinstance(dataset, Dataset) else dataset
        content_by_ds, unavailable_paths = Interface._prep(
            path=path,
            dataset=dataset,
            recursive=recursive)

        nonexistent_paths = []
        for p in unavailable_paths:
            # we need to check whether any of these correspond
            # to a known subdataset, and add those to the list of
            # things to be removed
            toppath = get_dataset_root(p)
            if not toppath:
                nonexistent_paths.append(p)
                continue
            if p in Dataset(toppath).get_subdatasets(
                    recursive=False, absolute=True):
                # this is a known subdataset that needs to be removed
                pl = content_by_ds.get(p, [])
                pl.append(p)
                content_by_ds[p] = pl
        if nonexistent_paths:
            lgr.warning("ignoring non-existent path(s): %s",
                        nonexistent_paths)

        if path_is_under(content_by_ds):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")

        handle_dirty_datasets(
            content_by_ds, mode=if_dirty, base=dataset)
        ds2save = set()
        results = []
        # iterate over all datasets, starting at the bottom
        # to make the removal of dataset content known upstairs
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            if ds_path in paths:
                # entire dataset needs to go
                superds = ds.get_superdataset(
                    datalad_only=False,
                    topmost=False)
                res = _uninstall_dataset(ds, check=check, has_super=False)
                results.extend(res)
                if ds.path in ds2save:
                    # we just uninstalled it, no need to save anything
                    ds2save.discard(ds.path)
                if not superds:
                    continue
                subds_relpath = relpath(ds_path, start=superds.path)
                # remove submodule reference
                submodule = [sm for sm in superds.repo.repo.submodules
                             if sm.path == subds_relpath]
                # there can only be one!
                assert(len(submodule) == 1)
                submodule = submodule[0]
                submodule.remove()
                if exists(ds_path):
                    # could be an empty dir in case an already uninstalled subdataset
                    # got removed
                    os.rmdir(ds_path)
                # need to save changes to .gitmodules later
                content_by_ds[superds.path] = \
                    content_by_ds.get(superds.path, []) \
                    + [opj(superds.path, '.gitmodules'),
                       ds_path]
                ds2save.add(superds.path)
            else:
                if check and hasattr(ds.repo, 'drop'):
                    _drop_files(ds, paths, check=True)
                results.extend(ds.repo.remove(paths, r=True))
                ds2save.add(ds.path)

        if dataset and dataset.is_installed():
            # forge chain from base dataset to any leaf dataset
            # in order to save state changes all the way up
            _discover_trace_to_known(dataset.path, [], content_by_ds)

        save_dataset_hierarchy(
            content_by_ds,
            base=dataset.path if dataset and dataset.is_installed() else None,
            message='[DATALAD] removed content')
        return results
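
For orientation, a minimal sketch of how this removal routine might be invoked through the Python API; the import path and the dataset location are assumptions for illustration, while the parameters mirror the signature above:

from datalad.api import remove  # assumed entry point backing this __call__

# remove a registered subdataset and its content from a (made-up) dataset;
# check=True verifies annexed content can be dropped safely first
results = remove(path='subds', dataset='/tmp/some-dataset',
                 recursive=True, check=True, if_dirty='save-before')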
Beispiel #53
0
    def __call__(
            action='query',
            dataset=None,
            name=None,
            url=None,
            pushurl=None,
            description=None,
            # TODO consider true, for now like add_sibling
            fetch=False,
            as_common_datasrc=None,
            publish_depends=None,
            publish_by_default=None,
            annex_wanted=None,
            annex_required=None,
            annex_group=None,
            annex_groupwanted=None,
            inherit=False,
            get_annex_info=True,
            recursive=False,
            recursion_limit=None):

        # TODO: Detect malformed URL and fail?
        # XXX possibly fail if fetch is False and as_common_datasrc

        if annex_groupwanted and not annex_group:
            raise InsufficientArgumentsError(
                "To set groupwanted, you need to provide annex_group option")

        # TODO catch invalid action specified
        action_worker_map = {
            'query': _query_remotes,
            'add': _add_remote,
            'configure': _configure_remote,
            'remove': _remove_remote,
            'enable': _enable_remote,
        }
        # all worker strictly operate on a single dataset
        # anything that deals with hierarchies and/or dataset
        # relationships in general should be dealt with in here
        # at the top-level and vice versa
        worker = action_worker_map[action]

        dataset = require_dataset(dataset,
                                  check_installed=False,
                                  purpose='sibling configuration')
        refds_path = dataset.path

        res_kwargs = dict(refds=refds_path, logger=lgr)

        ds_name = basename(dataset.path)

        # do not form a single list of datasets (with recursion results) up
        # front; yield results as they come to give the fastest possible
        # response, at the price of a longer overall function call
        ds = dataset
        for r in worker(
                # always copy signature to below to avoid bugs!
                ds,
                name,
                ds.repo.get_remotes(),
                # for the top-level dataset there are no layout questions
                _mangle_urls(url, ds_name),
                _mangle_urls(pushurl, ds_name),
                fetch,
                description,
                as_common_datasrc,
                publish_depends,
                publish_by_default,
                annex_wanted,
                annex_required,
                annex_group,
                annex_groupwanted,
                inherit,
                get_annex_info,
                **res_kwargs):
            yield r
        if not recursive:
            return

        # do we have instructions to register siblings with some alternative
        # layout?
        replicate_local_structure = url and "%NAME" not in url
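        # Illustration (URLs made up): with url='http://server/%NAME' the
        # placeholder is filled per subdataset by _mangle_urls() below with
        # '<ds_name>/<subds_name>'; with url='http://server/top.git' (no
        # '%NAME') the local layout is replicated instead, by slash_join()-ing
        # each subdataset's relative path onto the URL.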

        for subds in dataset.subdatasets(fulfilled=True,
                                         recursive=recursive,
                                         recursion_limit=recursion_limit,
                                         result_xfm='datasets'):
            subds_name = relpath(subds.path, start=dataset.path)
            if replicate_local_structure:
                subds_url = slash_join(url, subds_name)
                subds_pushurl = slash_join(pushurl, subds_name)
            else:
                subds_url = \
                    _mangle_urls(url, '/'.join([ds_name, subds_name]))
                subds_pushurl = \
                    _mangle_urls(pushurl, '/'.join([ds_name, subds_name]))
            for r in worker(
                    # always copy signature from above to avoid bugs
                    subds,
                    name,
                    subds.repo.get_remotes(),
                    subds_url,
                    subds_pushurl,
                    fetch,
                    description,
                    as_common_datasrc,
                    publish_depends,
                    publish_by_default,
                    annex_wanted,
                    annex_required,
                    annex_group,
                    annex_groupwanted,
                    inherit,
                    get_annex_info,
                    **res_kwargs):
                yield r
Beispiel #54
0
    def __call__(
            revision="HEAD",
            since=None,
            dataset=None,
            branch=None,
            message=None,
            onto=None,
            script=None,
            report=False):

        ds = require_dataset(
            dataset, check_installed=True,
            purpose='rerunning a command')

        lgr.debug('rerunning command output underneath %s', ds)

        if script is None and not report and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

        if not ds.repo.get_hexsha():
            yield get_status_dict(
                'run', ds=ds,
                status='impossible',
                message='cannot rerun command, nothing recorded')
            return

        if branch and branch in ds.repo.get_branches():
            yield get_status_dict(
                "run", ds=ds, status="error",
                message="branch '{}' already exists".format(branch))
            return

        if not ds.repo.commit_exists(revision + "^"):
            # Only a single commit is reachable from `revision`.  In
            # this case, --since has no effect on the range construction.
            revrange = revision
        elif since is None:
            revrange = "{rev}^..{rev}".format(rev=revision)
        elif since.strip() == "":
            revrange = revision
        else:
            revrange = "{}..{}".format(since, revision)
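        # Worked examples of the resulting range (assuming revision="HEAD"):
        #   since=None   -> "HEAD^..HEAD"  (only the most recent commit)
        #   since=""     -> "HEAD"         (everything reachable from HEAD)
        #   since="v1.0" -> "v1.0..HEAD"
        # and if HEAD has no parent at all, the range collapses to plain "HEAD".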

        if ds.repo.repo.git.rev_list("--merges", revrange, "--"):
            yield get_status_dict(
                "run", ds=ds, status="error",
                message="cannot rerun history with merge commits")
            return

        results = _rerun_as_results(ds, revrange, since, branch, onto, message)
        if script:
            handler = _get_script_handler(script, since, revision)
        elif report:
            handler = _report
        else:
            handler = _rerun

        for res in handler(ds, results):
            yield res
Beispiel #55
0
    def __call__(dataset, filename='README.md', existing='skip'):
        from os.path import lexists
        from os.path import join as opj
        from io import open
        import logging
        lgr = logging.getLogger('datalad.plugin.add_readme')

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import assure_list

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='add README')

        filename = opj(dataset.path, filename)
        res_kwargs = dict(action='add_readme', path=filename)

        if lexists(filename) and existing == 'skip':
            yield dict(
                res_kwargs,
                status='notneeded',
                message='file already exists, and not appending content')
            return

        # unlock, file could be annexed
        if lexists(filename):
            dataset.unlock(filename)

        # get any metadata on the dataset itself
        dsinfo = dataset.metadata('.',
                                  reporton='datasets',
                                  return_type='item-or-list',
                                  on_failure='ignore')
        meta = {}
        if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
            lgr.warning("Could not obtain dataset metadata, proceeding without")
            dsinfo = {}
        else:
            # flatten possibly existing multiple metadata sources
            for src in dsinfo['metadata']:
                if src.startswith('@'):
                    # not a source
                    continue
                meta.update(dsinfo['metadata'][src])

        metainfo = ''
        for label, content in (
            ('', meta.get('description', meta.get('shortdescription', ''))),
            ('Author{}'.format(
                's' if isinstance(meta.get('author', None), list) else ''),
             u'\n'.join([
                 u'- {}'.format(a) for a in assure_list(meta.get('author', []))
             ])),
            ('Homepage', meta.get('homepage', '')),
            ('Reference', meta.get('citation', '')),
            ('License', meta.get('license', '')),
            ('Keywords', u', '.join([
                u'`{}`'.format(k) for k in assure_list(meta.get('tag', []))
            ])),
            ('Funding', meta.get('fundedby', '')),
        ):
            if label and content:
                metainfo += u'\n\n### {}\n\n{}'.format(label, content)
            elif content:
                metainfo += u'\n\n{}'.format(content)

        for key in 'title', 'name', 'shortdescription':
            if 'title' in meta:
                break
            if key in meta:
                meta['title'] = meta[key]

        default_content = u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
            title='Dataset "{}"'.format(meta['title'])
            if 'title' in meta else 'About this dataset',
            metainfo=metainfo,
            id=u' (id: {})'.format(dataset.id) if dataset.id else '',
        )
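        # For example (metadata values assumed): meta={'title': 'My study'}
        # renders the heading as '# Dataset "My study"', while an empty meta
        # dict falls back to '# About this dataset'.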

        with open(filename,
                  'a' if existing == 'append' else 'w',
                  encoding='utf-8') as fp:
            fp.write(default_content)
            yield dict(status='ok',
                       path=filename,
                       type='file',
                       action='add_readme')

        for r in dataset.rev_save(filename,
                                  message='[DATALAD] added README',
                                  result_filter=None,
                                  result_xfm=None):
            yield r
Beispiel #56
0
    def __call__(dataset, filename=None, archivetype='tar', compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo
        from datalad.dochelpers import exc_str

        import logging
        lgr = logging.getLogger('datalad.plugin.export_archive')

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit in which any particular
            # file was changed -- that would be the most kosher approach,
            # yoh thinks, to the degree of our abilities
            ti.mtime = committed_date
            return ti
        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype,
            '{}{}'.format(
                '.' if compression else '',
                compression) if archivetype == 'tar' else '')
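        # e.g. archivetype='tar', compression='gz' -> '.tar.gz'
        #      archivetype='tar', compression=''   -> '.tar'
        #      archivetype='zip' (any compression) -> '.zip'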

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            filename = op.join(filename, default_filename)  # under given directory
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(
                    repo_files, allow_quick=True, batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(
                    repo_files, allow_quick=True, batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning if missing_content == 'continue' else lgr.debug)(
                                'File %s has no content available, skipped', fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' % fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(
                    fpath,
                    arcname=aname,
                    **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(
            status='ok',
            path=filename,
            type='file',
            action='export_archive',
            logger=lgr)
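
A minimal invocation sketch; the import path assumes the plugin is exposed through datalad.api as export_archive, and the target filename is made up:

from datalad.api import export_archive  # assumed entry point for this plugin

# export the dataset in the current directory as an uncompressed ZIP file,
# silently skipping annexed files whose content is not locally present
for res in export_archive(dataset='.', filename='/tmp/my-dataset',
                          archivetype='zip', compression='',
                          missing_content='ignore'):
    print(res['status'], res['path'])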
Beispiel #57
0
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=None):
        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = ds.path if ds else None

        # legacy compatibility
        if reckless is True:
            # so that we can forget about how things used to be
            reckless = 'auto'

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `save`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            # since this is a relative `path`, resolve it:
            # we are not going to reuse the decoded URL, as this is done for
            # all source candidates in clone_dataset(); we just use it to determine
            # a destination path here in order to perform a bunch of additional
            # checks that shall not pollute the helper function
            source_ = decode_source_spec(
                source, cfg=None if ds is None else ds.config)
            path = resolve_path(source_['default_destpath'], dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        result_props = dict(
            action='install',
            logger=lgr,
            refds=refds_path,
            source_url=source)

        try:
            # this will implicitly cause pathlib to run a bunch of checks
            # whether the present path makes any sense on the platform
            # we are running on -- we don't care if the path actually
            # exists at this point, but we want to abort early if the path
            # spec is determined to be useless
            path.exists()
        except OSError as e:
            yield get_status_dict(
                status='error',
                path=path,
                message=('cannot handle target path: %s', exc_str(e)),
                **result_props)
            return

        destination_dataset = Dataset(path)
        result_props['ds'] = destination_dataset

        if ds is not None and ds.pathobj not in path.parents:
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified target dataset '%s'",
                         path, ds),
                **result_props)
            return

        # perform the actual cloning operation
        yield from clone_dataset(
            [source],
            destination_dataset,
            reckless,
            description,
            result_props,
            cfg=None if ds is None else ds.config,
        )

        # TODO handle any 'version' property handling and verification using a dedicated
        # public helper

        if ds is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in ds.save(
                    path,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r
Beispiel #58
0
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False,
                saver=_save_outputs):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try to
        use fairly specific key names and are encouraged to nest fields under a
        top-level "namespace" key (e.g., the project or extension name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    saver : callable, optional
        Must take a dataset instance, a list of paths to save, and a
        message string as arguments and must record any changes done
        to any content matching an entry in the path list. Must yield
        result dictionaries as a generator.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    if not inject:
        for res in prepare_inputs(ds, inputs, extra_inputs):
            yield res

        if outputs:
            for res in _install_and_reglob(ds, outputs):
                yield res
            for res in _unlock_or_remove(ds, outputs.expand(full=True)):
                yield res

        if rerun_outputs is not None:
            # These are files we need to unlock/remove for a rerun that aren't
            # included in the explicit outputs. Unlike inputs/outputs, these are
            # full paths, so we can pass them directly to unlock.
            for res in _unlock_or_remove(ds, rerun_outputs):
                yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)


    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar', default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
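            # e.g. a key set in both the global and the local git config can
            # come back as ('false', 'true'); taking the last element mirrors
            # what `git config` itself would report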
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar


    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get('datalad.run.record-directory', default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        if outputs_to_save:
            msg_path = relpath(opj(ds.repo.path, ds.repo.get_git_dir(ds.repo),
                                   "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad add -d . -r -F %s .'",
                     msg_path)
        raise exc
    elif outputs_to_save:
        for r in saver(ds, outputs_to_save, msg):
            yield r
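
For orientation, a minimal sketch of calling this helper directly; the dataset path, command, and globs are made-up values, and in normal use it is reached through the `run` command rather than imported by hand:

from datalad.distribution.dataset import Dataset

ds = Dataset('/tmp/analysis')  # assumed, already installed dataset
for res in run_command('python code/fit.py {inputs} {outputs}',
                       dataset=ds,
                       inputs=['data/*.csv'],
                       outputs=['results/model.json'],
                       explicit=True,
                       message='fit model'):
    print(res['status'], res.get('action'))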
Beispiel #59
0
    def __call__(path=None,
                 *,
                 sibling=None,
                 merge=False,
                 how=None,
                 how_subds=None,
                 follow="sibling",
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 fetch_all=None,
                 reobtain_data=False):
        if fetch_all is not None:
            lgr.warning(
                'update(fetch_all=...) called. Option has no effect, and will be removed'
            )
        if path and not recursive:
            lgr.warning('path constraints for subdataset updates ignored, '
                        'because `recursive` option was not given')

        how, how_subds = _process_how_args(merge, how, how_subds)
        # `merge` should be considered through `how` and `how_subds` only.
        # Unbind `merge` to ensure that downstream code doesn't look at it.
        del merge

        refds = require_dataset(dataset,
                                check_installed=True,
                                purpose='update')

        save_paths = []
        update_failures = set()
        saw_subds = False
        for ds, revision in itertools.chain(
            [(refds, None)],
                refds.subdatasets(path=path,
                                  state='present',
                                  recursive=recursive,
                                  recursion_limit=recursion_limit,
                                  return_type='generator',
                                  result_renderer='disabled',
                                  result_xfm=YieldDatasetAndRevision())
                if recursive else []):
            if ds != refds:
                saw_subds = True
            repo = ds.repo
            is_annex = isinstance(repo, AnnexRepo)
            # prepare return value
            res = get_status_dict('update',
                                  ds=ds,
                                  logger=lgr,
                                  refds=refds.path)

            follow_parent = revision and follow.startswith("parentds")
            follow_parent_lazy = revision and follow == "parentds-lazy"
            if follow_parent_lazy and \
               repo.get_hexsha(repo.get_corresponding_branch()) == revision:
                res["message"] = (
                    "Dataset already at commit registered in parent: %s",
                    repo.path)
                res["status"] = "notneeded"
                yield res
                continue

            how_curr = how_subds if revision else how
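            # `revision` is None for the top-level dataset (it entered the
            # loop as `(refds, None)`) and carries the registered subdataset
            # commit otherwise, so the top-level dataset follows `how` while
            # subdatasets follow `how_subds`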
            # get all remotes which have references (would exclude
            # special remotes)
            remotes = repo.get_remotes(**({
                'exclude_special_remotes': True
            } if is_annex else {}))
            if not remotes and not sibling:
                res['message'] = (
                    "No siblings known to dataset at %s\nSkipping", repo.path)
                res['status'] = 'notneeded'
                yield res
                continue
            curr_branch = repo.get_active_branch()
            tracking_remote = None
            if not sibling and len(remotes) == 1:
                # there is only one remote, must be this one
                sibling_ = remotes[0]
            elif not sibling:
                # nothing given, look for tracking branch
                tracking_remote = repo.get_tracking_branch(branch=curr_branch,
                                                           remote_only=True)[0]
                sibling_ = tracking_remote
            else:
                sibling_ = sibling
            if sibling_ and sibling_ not in remotes:
                res['message'] = ("'%s' not known to dataset %s\nSkipping",
                                  sibling_, repo.path)
                res['status'] = 'impossible'
                yield res
                continue
            if not sibling_ and len(remotes) > 1 and how_curr:
                lgr.debug("Found multiple siblings:\n%s", remotes)
                res['status'] = 'impossible'
                res['message'] = "Multiple siblings, please specify from which to update."
                yield res
                continue
            lgr.info("Fetching updates for %s", ds)
            # fetch remote
            fetch_kwargs = dict(
                # test against user-provided value!
                remote=None if sibling is None else sibling_,
                all_=sibling is None,
                git_options=[
                    # required to not trip over submodules that were removed in
                    # the origin clone
                    "--no-recurse-submodules",
                    # prune to not accumulate a mess over time
                    "--prune"
                ])
            if not (follow_parent_lazy and repo.commit_exists(revision)):
                try:
                    repo.fetch(**fetch_kwargs)
                except CommandError as exc:
                    ce = CapturedException(exc)
                    yield get_status_dict(
                        status="error",
                        message=("Fetch failed: %s", ce),
                        exception=ce,
                        **res,
                    )
                    continue

            # NOTE: re-evaluate ds.repo, as it might have been converted from
            # a GitRepo to an AnnexRepo
            repo = ds.repo

            if follow_parent and not repo.commit_exists(revision):
                if sibling_:
                    try:
                        lgr.debug("Fetching revision %s directly for %s",
                                  revision, repo)
                        repo.fetch(remote=sibling_,
                                   refspec=revision,
                                   git_options=["--recurse-submodules=no"])
                    except CommandError as exc:
                        ce = CapturedException(exc)
                        yield dict(
                            res,
                            status="impossible",
                            message=("Attempt to fetch %s from %s failed: %s",
                                     revision, sibling_, ce),
                            exception=ce)
                        continue
                else:
                    yield dict(res,
                               status="impossible",
                               message=("Need to fetch %s directly "
                                        "but single sibling not resolved",
                                        revision))
                    continue

            saw_update_failure = False
            if how_curr:
                if follow_parent:
                    target = revision
                else:
                    target = _choose_update_target(repo, curr_branch, sibling_,
                                                   tracking_remote)

                adjusted = is_annex and repo.is_managed_branch(curr_branch)
                if adjusted:
                    if follow_parent:
                        yield dict(
                            res,
                            status="impossible",
                            message=("follow='parentds' is incompatible "
                                     "with adjusted branches"))
                        continue
                    if how_curr != "merge":
                        yield dict(
                            res,
                            status="impossible",
                            message=("Updating via '%s' is incompatible "
                                     "with adjusted branches", how_curr))
                        continue

                update_fn = _choose_update_fn(repo,
                                              how_curr,
                                              is_annex=is_annex,
                                              adjusted=adjusted)

                fn_opts = ["--ff-only"] if how_curr == "ff-only" else None
                if update_fn is not _annex_sync:
                    if target is None:
                        yield dict(res,
                                   status="impossible",
                                   message="Could not determine update target")
                        continue

                if is_annex and reobtain_data:
                    update_fn = _reobtain(ds, update_fn)

                for ures in update_fn(repo, sibling_, target, opts=fn_opts):
                    # NOTE: Ideally the "merge" action would also be prefixed
                    # with "update.", but a plain "merge" is used for backward
                    # compatibility.
                    if ures["status"] != "ok" and (
                            ures["action"] == "merge"
                            or ures["action"].startswith("update.")):
                        saw_update_failure = True
                    yield dict(res, **ures)

            if saw_update_failure:
                update_failures.add(ds)
                res['status'] = 'error'
                res['message'] = ("Update of %s failed", target)
            else:
                res['status'] = 'ok'
                save_paths.append(ds.path)
            yield res
        # we need to save updated states only if merge was requested -- otherwise
        # it was a pure fetch
        if how_curr and recursive:
            yield from _save_after_update(refds, save_paths, update_failures,
                                          path, saw_subds)
Beispiel #60
0
    def __call__(dataset, filename='README.md', existing='skip'):
        from os.path import lexists
        from os.path import join as opj
        from io import open
        import logging
        lgr = logging.getLogger('datalad.plugin.add_readme')

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import assure_list

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='add README')

        filename = opj(dataset.path, filename)
        res_kwargs = dict(action='add_readme', path=filename)

        if lexists(filename) and existing == 'skip':
            yield dict(
                res_kwargs,
                status='notneeded',
                message='file already exists, and not appending content')
            return

        # unlock, file could be annexed
        if lexists(filename):
            dataset.unlock(filename)

        # get any metadata on the dataset itself
        dsinfo = dataset.metadata(
            '.', reporton='datasets', return_type='item-or-list',
            on_failure='ignore')
        meta = {}
        if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
            lgr.warning("Could not obtain dataset metadata, proceeding without")
            dsinfo = {}
        else:
            # flatten possibly existing multiple metadata sources
            for src in dsinfo['metadata']:
                if src.startswith('@'):
                    # not a source
                    continue
                meta.update(dsinfo['metadata'][src])

        metainfo = ''
        for label, content in (
                ('', meta.get('description', meta.get('shortdescription', ''))),
                ('Author{}'.format('s' if isinstance(meta.get('author', None), list) else ''),
                    u'\n'.join([u'- {}'.format(a) for a in assure_list(meta.get('author', []))])),
                ('Homepage', meta.get('homepage', '')),
                ('Reference', meta.get('citation', '')),
                ('License', meta.get('license', '')),
                ('Keywords', u', '.join([u'`{}`'.format(k) for k in assure_list(meta.get('tag', []))])),
                ('Funding', meta.get('fundedby', '')),
                ):
            if label and content:
                metainfo += u'\n\n### {}\n\n{}'.format(label, content)
            elif content:
                metainfo += u'\n\n{}'.format(content)

        for key in 'title', 'name', 'shortdescription':
            if 'title' in meta:
                break
            if key in meta:
                meta['title'] = meta[key]

        default_content = u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
            title='Dataset "{}"'.format(meta['title']) if 'title' in meta else 'About this dataset',
            metainfo=metainfo,
            id=u' (id: {})'.format(dataset.id) if dataset.id else '',
            )

        with open(filename, 'a' if existing == 'append' else 'w', encoding='utf-8') as fp:
            fp.write(default_content)
            yield dict(
                status='ok',
                path=filename,
                type='file',
                action='add_readme')

        for r in dataset.save(
                filename,
                message='[DATALAD] added README',
                result_filter=None,
                result_xfm=None):
            yield r
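
A minimal usage sketch for this plugin; the import path assumes it is exposed through datalad.api as add_readme, and the dataset location is made up:

from datalad.api import add_readme  # assumed entry point for this plugin

# append the generated section to an existing README instead of skipping it
for res in add_readme(dataset='/tmp/my-dataset', filename='README.md',
                      existing='append'):
    print(res['status'], res['path'])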