Example #1
    def check(self, name, min_version=None, msg=""):
        """Check if an external (optionally of a specified min version) is present

        Parameters
        ----------
        name: str
          Name of the external (typically a Python module)
        min_version: str or version, optional
          Minimal version to satisfy
        msg: str, optional
          An additional message to include into the exception message

        Raises
        ------
        MissingExternalDependency
          if the external is completely missing
        OutdatedExternalDependency
          if the external is present but does not satisfy the min_version
        """
        ver_present = self[name]
        if ver_present is None:
            raise MissingExternalDependency(name, ver=min_version, msg=msg)
        elif min_version and ver_present < min_version:
            raise OutdatedExternalDependency(name,
                                             ver=min_version,
                                             ver_present=ver_present,
                                             msg=msg)
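
As a usage illustration (not part of the original snippet), such a checker is looked up like a mapping and called before optional functionality is used. The import paths below are assumed from the DataLad codebase and the call mirrors the check() signature shown above; treat this as a sketch rather than canonical usage:

# a minimal sketch, assuming DataLad's import paths; verify against your installed version
from datalad.support.exceptions import (
    MissingExternalDependency,
    OutdatedExternalDependency,
)
from datalad.support.external_versions import external_versions

try:
    # the same check() as defined above: raises if scrapy is absent or too old
    external_versions.check("scrapy", min_version="1.1.0",
                            msg="needed for crawling HTML pages")
except (MissingExternalDependency, OutdatedExternalDependency) as exc:
    print("Install or upgrade the dependency first: %s" % exc)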
Example #2
    def __call__(self, data):
        input = data.pop(self._input) if self._pop_input else data[self._input]

        if not Response:
            # we have no scrapy
            raise MissingExternalDependency(
                "scrapy", msg="It is needed for this type of crawling")
        if isinstance(input, Response):
            selector = Selector(response=input)
            if hasattr(input, 'url') and input.url and ('url' not in data):
                # take the URL of the response object
                data = updated(data, {'url': input.url})
        else:
            selector = Selector(text=input)

        count = 0
        for entry, data_ in self._select_and_extract(selector, self.query,
                                                     data):
            data_ = updated(data_, {self._output: entry.extract()})
            # now get associated xpaths, css, etc
            for selectors_dict, entry_method in ((self._xpaths, entry.xpath),
                                                 (self._csss, entry.css)):
                if not selectors_dict:
                    continue
                for key in selectors_dict:
                    selector_ = selectors_dict[key]
                    key_extracted = entry_method(selector_).extract()
                    if not key_extracted:
                        # TODO: warning, make mandatory to have a hit if expected?
                        continue
                    if len(key_extracted) > 1:
                        if self._allow_multiple:
                            data_[key] = key_extracted
                            # raise NotImplementedError("Don't know what to do yet with this one")
                        else:
                            lgr.warning(
                                "Got multiple selections for xpath query %s. "
                                "Keeping only the first one: %s",
                                repr(selector_), key_extracted[0])
                            data_[key] = key_extracted[0]
                    else:
                        data_[key] = key_extracted[0]
            count += 1
            yield data_

        if self._min_count and count < self._min_count:
            raise ValueError(
                "Did not match required %d matches (got %d) using %s" %
                (self._min_count, count, self))

        if self._max_count and count > self._max_count:
            raise ValueError("Matched more than %d matches (got %d) using %s" %
                             (self._max_count, count, self))
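
The method above relies on module-level names Response and Selector that must evaluate to something falsy when scrapy is not installed (hence the "if not Response:" branch). A guarded import along the following lines is assumed; this is a sketch of the pattern, not necessarily the module's exact code:

# assumed module-level guard; it is what makes the "if not Response:" check above work
try:
    from scrapy.http import Response
    from scrapy.selector import Selector
except ImportError:  # scrapy is an optional dependency
    Response = Selector = None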
Example #3
def _parse_date(date):
    if date.startswith("@"):  # unix timestamp
        timestamp = int(date[1:])
    else:
        try:
            import dateutil.parser
        except ImportError:
            raise MissingExternalDependency(
                "python-dateutil",
                msg="This package is required to parse non-timestamp dates")

        from calendar import timegm
        # Note: datetime.timestamp isn't available in Python 2.
        try:
            timestamp = timegm(dateutil.parser.parse(date).utctimetuple())
        except TypeError as exc:
            # Make older dateutil versions return a consistent error for
            # invalid dates.
            raise ValueError(exc)
    return timestamp
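
For illustration, the two accepted date forms behave as follows (a hedged sketch; the second call requires python-dateutil, matching the guard above):

# "@<seconds>" is taken verbatim as a unix timestamp
assert _parse_date("@1514764800") == 1514764800
# anything else is parsed with dateutil and converted to a UTC epoch
assert _parse_date("2018-01-01T00:00:00+00:00") == 1514764800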
Example #4
    def __call__(sshurl, name=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None, annex_group=None, annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case it was not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL when the inherit option is not used"
                )
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified"
                )
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings"
            )
            # maybe it could be safely dropped -- still WiP

        if not sshurl:
            # TODO: may be more back up before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds
                )
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
        # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

        # check the login URL
        sshri = RI(sshurl)
        if not is_ssh(sshri):
            raise ValueError(
                "Unsupported SSH URL: '{0}', "
                "use ssh://host/path or host:path syntax".format(sshurl))

        if not name:
            # use the hostname as default remote name
            name = sshri.hostname
            lgr.debug(
                "No sibling name given, using URL hostname '%s' as sibling name",
                name)

        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # neither of the next two should happen anyway
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify alternative name, or force "
                    "reconfiguration via --existing)",
                    name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        # request ssh connection:
        lgr.info("Connecting ...")
        assert sshurl is not None  # delayed sanity verification
        ssh = ssh_manager.get_connection(sshurl)
        if not ssh.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg='on the remote system')

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom to make test
        # below valid (existing directories would cause the machinery to halt)
        # But we need to run post-update hook in depth-first fashion, so
        # would only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                ds.path,
                ssh,
                replicate_local_structure,
                sshri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == ds.path and ui:
                lgr.info("Uploading web interface to %s", path)
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                ssh("cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if not currentds_ap['path'] in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap
Example #5
    def __call__(
            reponame,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            name='github',
            existing='error',
            github_login=None,
            github_passwd=None,
            github_organization=None,
            access_protocol='https',
            publish_depends=None,
            dryrun=False):
        try:
            # this is an absolute leaf package, import locally to avoid
            # unnecessary dependencies
            import github as gh
        except ImportError:
            raise MissingExternalDependency(
                'PyGitHub',
                msg='GitHub-related functionality is unavailable without this package')

        # what to operate on
        ds = require_dataset(
            dataset, check_installed=True, purpose='create Github sibling')
        # gather datasets and essential info
        # dataset instance and mountpoint relative to the top
        toprocess = [(ds, '')]
        if recursive:
            for sub in ds.subdatasets(
                    fulfilled=None,  # we want to report on missing datasets here
                    recursive=recursive,
                    recursion_limit=recursion_limit,
                    result_xfm='datasets'):
                if not sub.is_installed():
                    lgr.info('Ignoring unavailable subdataset %s', sub)
                    continue
                toprocess.append((sub, relpath(sub.path, start=ds.path)))

        # check for existing remote configuration
        filtered = []
        for d, mp in toprocess:
            if name in d.repo.get_remotes():
                if existing == 'error':
                    msg = '{} already had a configured sibling "{}"'.format(
                        d, name)
                    if dryrun:
                        lgr.error(msg)
                    else:
                        raise ValueError(msg)
                elif existing == 'skip':
                    continue
            gh_reponame = '{}{}{}'.format(
                reponame,
                '-' if mp else '',
                template_fx(mp))
            filtered.append((d, gh_reponame))

        if not filtered:
            # all skipped
            return []

        # actually make it happen on Github
        rinfo = _make_github_repos(
            gh, github_login, github_passwd, github_organization, filtered,
            existing, access_protocol, dryrun)

        # lastly configure the local datasets
        for d, url, existed in rinfo:
            if not dryrun:
                # first make sure that annex doesn't touch this one
                # but respect any existing config
                ignore_var = 'remote.{}.annex-ignore'.format(name)
                if not ignore_var in d.config:
                    d.config.add(ignore_var, 'true', where='local')
                Siblings()(
                    'configure',
                    dataset=d,
                    name=name,
                    url=url,
                    recursive=False,
                    # TODO fetch=True, maybe only if one existed already
                    publish_depends=publish_depends)

        # TODO let submodule URLs point to Github (optional)
        return rinfo
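
The guarded import at the top of this example (try/except ImportError raising MissingExternalDependency) is a recurring pattern in these snippets. A small, self-contained helper capturing it could look like the sketch below; it is purely illustrative, not part of DataLad, and the exception's import path is assumed:

from importlib import import_module

from datalad.support.exceptions import MissingExternalDependency  # assumed import path


def require_module(name, pkg=None, msg=""):
    """Import and return an optional module, or raise MissingExternalDependency.

    `pkg` is the installable package name if it differs from the module name.
    """
    try:
        return import_module(name)
    except ImportError:
        raise MissingExternalDependency(pkg or name, msg=msg)


# e.g. mirroring Example #5's guarded import of PyGitHub
gh = require_module("github", pkg="PyGitHub",
                    msg="GitHub-related functionality is unavailable without this package")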
Example #6
    def __call__(sshurl,
                 *,
                 name=None,
                 target_dir=None,
                 target_url=None,
                 target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None,
                 annex_group=None,
                 annex_groupwanted=None,
                 inherit=False,
                 since=None):
        if ui:
            # the webui has been moved to the deprecated extension
            try:
                from datalad_deprecated.sibling_webui \
                    import upload_web_interface
            except Exception as e:
                # we could just test for ModuleNotFoundError (which should be
                # all that would happen with PY3.6+), but be a little more robust
                # and use the pattern from duecredit
                if type(e).__name__ not in ('ImportError',
                                            'ModuleNotFoundError'):
                    lgr.error(
                        "Failed to import datalad_deprecated.sibling_webui "
                        "due to %s", str(e))
                raise RuntimeError(
                    "The DataLad web UI has been moved to an extension "
                    "package. Please install the Python package "
                    "`datalad_deprecated` to be able to deploy it.")

        # push uses '^' to annotate the previous pushed committish, and None for default
        # behavior. '' was/is (to be deprecated) used in `publish` and 'create-sibling'.
        # Alert user about the mistake
        if since == '':
            # deprecation was added prior 0.16.0
            import warnings
            warnings.warn("'since' should point to commitish or use '^'.",
                          DeprecationWarning)
            since = '^'

        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create sibling(s)')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #
        if since and not recursive:
            raise ValueError("The use of 'since' requires 'recursive'")
        # possibly use sshurl to get the name in case it was not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL when the inherit option is not used")
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified")
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                ce = CapturedException(exc)
                lgr.debug('%s does not know about url for %s: %s', ds, name,
                          ce)
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings")
            # maybe it could be safely dropped -- still WiP

        if not sshurl:
            # TODO: may be more back up before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL" %
                    ds)
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(refds_path, super_ds.path))

        # check the login URL
        sibling_ri = RI(sshurl)
        ssh_sibling = is_ssh(sibling_ri)
        if not (ssh_sibling or isinstance(sibling_ri, PathRI)):
            raise ValueError(
                "Unsupported SSH URL or path: '{0}', "
                "use ssh://host/path, host:path or path syntax".format(sshurl))

        if not name:
            name = sibling_ri.hostname if ssh_sibling else "local"
            lgr.info("No sibling name given. Using %s'%s' as sibling name",
                     "URL hostname " if ssh_sibling else "", name)
        if since == '^':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        to_process = []
        if recursive:
            #
            # parse the base dataset to find all subdatasets that need processing
            #
            cand_ds = [
                Dataset(r['path']) for r in diff_dataset(
                    ds,
                    fr=since,
                    to='HEAD',
                    # w/o False we might not follow into new subdatasets
                    # which do not have that remote yet setup,
                    # see https://github.com/datalad/datalad/issues/6596
                    constant_refs=False,
                    # save cycles, we are only looking for datasets
                    annex=None,
                    untracked='no',
                    recursive=True,
                    datasets_only=True,
                )
                # not installed subdatasets would be 'clean' so we would skip them
                if r.get('type') == 'dataset'
                and r.get('state', None) != 'clean'
            ]
            if not since:
                # not only subdatasets
                cand_ds = [ds] + cand_ds
        else:
            # only the current ds
            cand_ds = [ds]
        # check the remotes setup for each candidate dataset
        for d in cand_ds:
            d_repo = d.repo
            if d_repo is None:
                continue
            checkds_remotes = d.repo.get_remotes()
            res = dict(
                action='create_sibling',
                path=d.path,
                type='dataset',
            )

            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(
                    ensure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    yield dict(
                        res,
                        status='error',
                        message=('unknown sibling(s) specified as publication '
                                 'dependency: %s', unknown_deps),
                    )
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                yield dict(
                    res,
                    sibling_name=name,
                    status='error' if existing == 'error' else 'notneeded',
                    message=(
                        "sibling '%s' already configured (specify alternative "
                        "name, or force reconfiguration via --existing)", name),
                )
                continue
            to_process.append(res)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if ssh_sibling:
            # request ssh connection:
            lgr.info("Connecting ...")
            shell = ssh_manager.get_connection(sshurl)
        else:
            shell = _RunnerAdapter()
            sibling_ri.path = str(resolve_path(sibling_ri.path, dataset))
            if target_dir:
                target_dir = opj(sibling_ri.path, target_dir)

        if target_dir is None:
            if sibling_ri.path:
                target_dir = sibling_ri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        if not shell.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg="It's required on the {} machine to create a sibling".
                format('remote' if ssh_sibling else 'local'))

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom to make test
        # below valid (existing directories would cause the machinery to halt)
        # But we need to run post-update hook in depth-first fashion, so
        # would only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name, current_ds, refds_path, shell, replicate_local_structure,
                sibling_ri, target_dir, target_url, target_pushurl, existing,
                shared, group, publish_depends, publish_by_default, ui,
                as_common_datasrc, annex_wanted, annex_group,
                annex_groupwanted, inherit)
            currentds_ap["sibling_name"] = name
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == refds_path and ui:
                from datalad_deprecated.sibling_webui import upload_web_interface
                lgr.info("Uploading web interface to %s", path)
                try:
                    upload_web_interface(path, shell, shared, ui)
                except CommandError as e:
                    ce = CapturedException(e)
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        ce)
                    currentds_ap['exception'] = ce
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                shell(
                    "cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || true )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                ce = CapturedException(e)
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, ce)
                currentds_ap['exception'] = ce
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if not currentds_ap['path'] in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap