Example 1
    def init_remote_repo(path, ssh, shared, dataset, description=None):
        cmd = "git -C {} init{}".format(
            sh_quote(path),
            " --shared='{}'".format(sh_quote(shared)) if shared else '')
        try:
            ssh(cmd)
        except CommandError as e:
            lgr.error("Initialization of remote git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False

        if isinstance(dataset.repo, AnnexRepo):
            # init remote git annex repo (part fix of #463)
            try:
                ssh(
                    "git -C {} annex init {}".format(
                        sh_quote(path),
                        sh_quote(description)
                        if description else '')
                )
            except CommandError as e:
                lgr.error("Initialization of remote git annex repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, exc_str(e)))
                return False
        return True
Example 2
File: wtf.py Project: hanke/datalad
def _describe_extensions():
    infos = {}
    from pkg_resources import iter_entry_points
    from importlib import import_module

    for e in iter_entry_points('datalad.extensions'):
        info = {}
        infos[e.name] = info
        try:
            ext = e.load()
            info['load_error'] = None
            info['description'] = ext[0]
            info['module'] = e.module_name
            mod = import_module(e.module_name, package='datalad')
            info['version'] = getattr(mod, '__version__', None)
        except Exception as e:
            info['load_error'] = exc_str(e)
            continue
        info['entrypoints'] = entry_points = {}
        for ep in ext[1]:
            ep_info = {
                'module': ep[0],
                'class': ep[1],
                'names': ep[2:],
            }
            entry_points['{}.{}'.format(*ep[:2])] = ep_info
            try:
                import_module(ep[0], package='datalad')
                ep_info['load_error'] = None
            except Exception as e:
                ep_info['load_error'] = exc_str(e)
                continue
    return infos
Example 4
def _handle_exception(e, bucket_name):
    """Helper to handle S3 connection exception"""
    if e.error_code == 'AccessDenied':
        raise AccessDeniedError(exc_str(e))
    else:
        raise DownloadError("Cannot connect to %s S3 bucket. Exception: %s" %
                            (bucket_name, exc_str(e)))
Example 6
    def ensure_initialized(self):
        """Assures that manager is initialized - knows socket_dir, previous connections
        """
        if self._socket_dir is not None:
            return
        from datalad import cfg
        self._socket_dir = Path(cfg.obtain('datalad.locations.sockets'))
        self._socket_dir.mkdir(exist_ok=True, parents=True)
        try:
            os.chmod(str(self._socket_dir), 0o700)
        except OSError as exc:
            lgr.warning(
                "Failed to (re)set permissions on the %s. "
                "Most likely future communications would be impaired or fail. "
                "Original exception: %s",
                self._socket_dir, exc_str(exc)
            )

        try:
            self._prev_connections = [p
                                      for p in self.socket_dir.iterdir()
                                      if not p.is_dir()]
        except OSError as exc:
            self._prev_connections = []
            lgr.warning(
                "Failed to list %s for existing sockets. "
                "Most likely future communications would be impaired or fail. "
                "Original exception: %s",
                self._socket_dir, exc_str(exc)
            )

        lgr.log(5,
                "Found %d previous connections",
                len(self._prev_connections))
Example 7
    def assure_initialized(self):
        """Assures that manager is initialized - knows socket_dir, previous connections
        """
        if self._socket_dir is not None:
            return
        from ..config import ConfigManager
        from os import chmod
        cfg = ConfigManager()
        self._socket_dir = opj(cfg.obtain('datalad.locations.cache'),
                               'sockets')
        assure_dir(self._socket_dir)
        try:
            chmod(self._socket_dir, 0o700)
        except OSError as exc:
            lgr.warning(
                "Failed to (re)set permissions on the %s. "
                "Most likely future communications would be impaired or fail. "
                "Original exception: %s", self._socket_dir, exc_str(exc))

        from os import listdir
        from os.path import isdir
        try:
            self._prev_connections = [
                opj(self.socket_dir, p) for p in listdir(self.socket_dir)
                if not isdir(opj(self.socket_dir, p))
            ]
        except OSError as exc:
            self._prev_connections = []
            lgr.warning(
                "Failed to list %s for existing sockets. "
                "Most likely future communications would be impaired or fail. "
                "Original exception: %s", self._socket_dir, exc_str(exc))

        lgr.log(5, "Found %d previous connections",
                len(self._prev_connections))
Example 8
def get_bucket(conn, bucket_name):
    """A helper to get a bucket

    Parameters
    ----------
    bucket_name: str
        Name of the bucket to connect to
    """
    try:
        bucket = conn.get_bucket(bucket_name)
    except S3ResponseError as e:
        # can initially deny or error to connect to the specific bucket by name,
        # and we would need to list which buckets are available under following
        # credentials:
        lgr.debug("Cannot access bucket %s by name: %s", bucket_name,
                  exc_str(e))
        try:
            all_buckets = conn.get_all_buckets()
        except S3ResponseError as e2:
            lgr.debug("Cannot access all buckets: %s", exc_str(e2))
            _handle_exception(e, 'any (originally requested %s)' % bucket_name)
        all_bucket_names = [b.name for b in all_buckets]
        lgr.debug("Found following buckets %s", ', '.join(all_bucket_names))
        if bucket_name in all_bucket_names:
            bucket = all_buckets[all_bucket_names.index(bucket_name)]
        else:
            _handle_exception(e, bucket_name)
    return bucket
Example 9
def get_singularity_jobspec(cmd):
    """Extract the runscript of a singularity container used as an executable

    Parameters
    ----------
    cmd : list
      A command as an argument list.

    Returns
    -------
    None or str, None or list
      If no singularity is available, or the executable in the command is not
      a singularity image given by its path, None is returned. Otherwise the
      runscript of the container is returned as a string. The second value is
      None if the first is None, or a list of arguments to the runscript.
    """
    # get the path to the command's executable
    exec_path = cmd[0]

    runner = Runner()
    if not op.exists(exec_path):
        # probably a command from PATH
        return

    # this is a real file, not just a command on the path
    try:
        stdout, stderr = runner.run(
            ['singularity', '--version'],
            log_stdout=True,
            log_stderr=True,
            expect_stderr=True,
            expect_fail=True,
        )
        # TODO could be used to tailor handling to particular versions
    except CommandError as e:  # pragma: no cover
        # we do not have a singularity installation that we can handle
        # log debug, because there is no guarantee that the executable
        # actually was a singularity container
        lgr.debug('No suitable singularity version installed: %s', exc_str(e))
        return
    # we have singularity
    try:
        stdout, stderr = runner.run(
            # stringification only needed for pythons older than 3.6
            ['singularity', 'exec', exec_path, 'cat', '/singularity'],
            log_stdout=True,
            log_stderr=True,
            expect_stderr=True,
            expect_fail=True,
        )
        # TODO could be used to tailor handling to particular versions
    except CommandError as e:
        # we do not have a singularity installation that we can handle
        # log debug, because there is no guarantee that the executable
        # actually was a singularity container
        lgr.debug('%s is not a singularity image: %s', exec_path, exc_str(e))
        return
    # all but the container itself are the arguments
    return exec_path, cmd[1:]
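A minimal usage sketch for the function above (the image path and arguments are made up, and a local singularity installation is assumed):

# Hypothetical command whose executable is a singularity image on disk
spec = get_singularity_jobspec(['./analysis.simg', '--input', 'data.csv'])
if spec is not None:
    image_path, args = spec  # ('./analysis.simg', ['--input', 'data.csv'])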
Example 10
    def _visit_url(self, url, data):
        if url in self._seen:
            return
        # this is just a cruel first attempt
        lgr.debug("Visiting %s" % url)

        try:
            retry = 0
            orig_url = url
            if self._redirects_cache is not None:
                url = self._redirects_cache.get(url, url)
            while True:
                retry += 1
                if retry > 100:
                    raise DownloadError(
                        "We have followed 100 redirects already. Something is wrong!"
                    )
                try:
                    self._seen.add(url)
                    page = self._providers.fetch(url, allow_redirects=False)
                    break
                except UnhandledRedirectError as exc:
                    # since we care about tracking URL for proper full url construction
                    # we should disallow redirects and handle them manually here
                    lgr.debug("URL %s was redirected to %s" % (url, exc.url))
                    if url == exc.url:
                        raise DownloadError(
                            "Was redirected to the same url upon %s" %
                            exc_str(exc))
                    url = exc.url
                    if self._redirects_cache is not None:
                        self._redirects_cache[orig_url] = exc.url
        except DownloadError as exc:
            lgr.warning("URL %s failed to download: %s" % (url, exc_str(exc)))
            if self.failed in {None, 'skip'}:
                # TODO: config  -- crawl.failed='skip' should be a config option, for now always skipping
                return
            raise  # otherwise -- kaboom

        data_ = updated(data, zip(self._output, (page, url)))
        yield data_

        # now recurse if matchers were provided
        matchers = self._matchers
        if matchers:
            lgr.debug("Looking for more URLs at %s using %s", url, matchers)
            for matcher in (matchers if isinstance(matchers,
                                                   (list,
                                                    tuple)) else [matchers]):
                for data_matched in matcher(data_):
                    if 'url' not in data_matched:
                        lgr.warning("Got data without a url from %s" % matcher)
                        continue
                    # proxy findings
                    for data_matched_ in self._visit_url(
                            data_matched['url'], data_matched):
                        yield data_matched_
Example 11
def _describe_dataset(ds, sensitive):
    from datalad.interface.results import success_status_map
    from datalad.api import metadata

    try:
        infos = {
            'path': ds.path,
            'repo': ds.repo.__class__.__name__ if ds.repo else None,
        }
        if not sensitive:
            infos['metadata'] = _HIDDEN
        elif ds.id:
            ds_meta = metadata(
                dataset=ds, reporton='datasets', return_type='list',
                result_filter=lambda x: x['action'] == 'metadata' and success_status_map[x['status']] == 'success',
                result_renderer='disabled', on_failure='ignore')
            if ds_meta:
                ds_meta = [dm['metadata'] for dm in ds_meta]
                if len(ds_meta) == 1:
                    ds_meta = ds_meta.pop()
                infos['metadata'] = ds_meta
            else:
                infos['metadata'] = None
        return infos
    except InvalidGitRepositoryError as e:
        return {"invalid": exc_str(e)}
Example 12
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info

    if hasattr(pl, 'dist'):
        dist = pl.dist()
    else:
        # Python 3.8 removed .dist but recommended "distro" is slow, so we
        # try it only if needed
        try:
            import distro
            dist = distro.linux_distribution(full_distribution_name=False)
        except ImportError:
            lgr.info(
                "Please install 'distro' package to obtain distribution information"
            )
            dist = tuple()
        except Exception as exc:
            lgr.warning(
                "No distribution information will be provided since 'distro' "
                "fails to import/run: %s", exc_str(exc)
            )
            dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(dist),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
Example 13
    def _get(self, filepath):
        if not lexists(filepath):
            return None

        # I wish I could just test using filesystem stats but that would not
        # be reliable, and also file might not even be here.
        # File might be under git, not annex so then we would need to assess size
        filestat = os.lstat(filepath)
        try:
            with disable_logger():
                info = self.annex.info(filepath, batch=True)
            size = info['size']
        except (CommandError, TypeError) as exc:
            # must be under git or a plain file
            lgr.debug(
                "File %s must be not under annex, since info failed: %s" %
                (filepath, exc_str(exc)))
            size = filestat.st_size

        # deduce mtime from the file or a content which it points to. Take the oldest (I wonder
        # if it would bite ;) XXX)
        mtime = filestat.st_mtime
        if islink(filepath):
            filepath_ = realpath(filepath)  # symlinked to
            if exists(filepath_):
                mtime_ = os.stat(filepath_).st_mtime
                mtime = min(mtime_, mtime)
        return FileStatus(size=size, mtime=mtime)
Example 14
def get_run_info(message):
    """Extract run information from `message`

    Parameters
    ----------
    message : str
        A commit message.

    Returns
    -------
    A tuple with the command's message and a dict with run information. Both
    these values are None if `message` doesn't have a run command.

    Raises
    ------
    A ValueError if the information in `message` is invalid.
    """
    cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ' \
                   r'===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'
    runinfo = re.match(cmdrun_regex, message, re.MULTILINE | re.DOTALL)
    if not runinfo:
        return None, None

    rec_msg, runinfo = runinfo.groups()

    try:
        runinfo = json.loads(runinfo)
    except Exception as e:
        raise ValueError(
            'cannot rerun command, command specification is not valid JSON: '
            '%s' % exc_str(e))
    if 'cmd' not in runinfo:
        raise ValueError("Looks like a run commit but does not have a command")
    return rec_msg.rstrip(), runinfo
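A minimal sketch of a commit message in the format the regex above expects (the message content is made up for illustration):

message = (
    '[DATALAD RUNCMD] compute results\n\n'
    '=== Do not change lines below ===\n'
    '{"cmd": "python analyze.py", "exit": 0}\n'
    '^^^ Do not change lines above ^^^'
)
msg, run_info = get_run_info(message)
# msg == 'compute results'; run_info['cmd'] == 'python analyze.py'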
Example 15
    def close(self, allow_fail=True, ctrl_path=None):
        """Closes all connections, known to this instance.

        Parameters
        ----------
        allow_fail: bool, optional
          If True, swallow exceptions which might be thrown during
          connection.close, and just log them at DEBUG level
        ctrl_path: str, Path, or list of str or Path, optional
          If specified, only the path(s) provided would be considered
        """
        if self._connections:
            ctrl_paths = [Path(p) for p in ensure_list(ctrl_path)]
            to_close = [
                c for c in self._connections
                # don't close if connection wasn't opened by SSHManager
                if self._connections[c].ctrl_path not in self._prev_connections
                and self._connections[c].ctrl_path.exists() and
                (not ctrl_paths or self._connections[c].ctrl_path in ctrl_paths
                 )
            ]
            if to_close:
                lgr.debug("Closing %d SSH connections..." % len(to_close))
            for cnct in to_close:
                f = self._connections[cnct].close
                if allow_fail:
                    f()
                else:
                    try:
                        f()
                    except Exception as exc:
                        lgr.debug("Failed to close a connection: "
                                  "%s", exc_str(exc))
            self._connections = dict()
Example 16
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None):
    content_by_ds = {}
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return content_by_ds
    # loop over submodules not subdatasets to get the url right away
    # install using helper that give some flexibility regarding where to
    # get the module from
    for sub in ds.repo.get_submodules():
        subds = Dataset(opj(ds.path, sub.path))
        if start is not None and not subds.path.startswith(_with_sep(start)):
            # this one we can ignore, not underneath the start path
            continue
        if not subds.is_installed():
            try:
                lgr.info("Installing subdataset %s", subds.path)
                subds = _install_subds_from_flexible_source(
                    ds, sub.path, sub.url, reckless)
                # we want the entire thing, but mark this subdataset
                # as automatically installed
                content_by_ds[subds.path] = [curdir]
            except Exception as e:
                # skip, if we didn't manage to install subdataset
                lgr.warning(
                    "Installation of subdatasets %s failed, skipped", subds)
                lgr.debug("Installation attempt failed with exception: %s",
                          exc_str(e))
                continue
            # otherwise recurse
            # we can skip the start expression, we know we are within
            content_by_ds.update(_recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1 if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless
            ))
    return content_by_ds
Example 17
def _install_subds_from_flexible_source(ds, sm_path, sm_url, reckless):
    """Tries to obtain a given subdataset from several meaningful locations"""
    # compose a list of candidate clone URLs
    clone_urls = _get_flexible_source_candidates_for_submodule(
        ds, sm_path, sm_url)

    # now loop over all candidates and try to clone
    subds = Dataset(opj(ds.path, sm_path))
    try:
        clone_url = _clone_from_any_source(clone_urls, subds.path)
    except GitCommandError as e:
        raise InstallFailedError(
            msg="Failed to install %s from %s (%s)" % (
                subds, clone_urls, exc_str(e))
            )
    # do fancy update
    if sm_path in ds.get_subdatasets(absolute=False, recursive=False):
        lgr.debug("Update cloned subdataset {0} in parent".format(subds))
        ds.repo.update_submodule(sm_path, init=True)
    else:
        # submodule is brand-new and previously unknown
        ds.repo.add_submodule(sm_path, url=clone_url)
    _fixup_submodule_dotgit_setup(ds, sm_path)
    _handle_possible_annex_dataset(subds, reckless)
    return subds
Example 18
def _revrange_as_results(dset, revrange):
    ds_repo = dset.repo
    rev_lines = ds_repo.get_revisions(revrange,
                                      fmt="%H %P",
                                      options=["--reverse", "--topo-order"])
    if not rev_lines:
        return

    for rev_line in rev_lines:
        # The strip() below is necessary because, with the format above, a
        # commit without any parent has a trailing space. (We could also use a
        # custom `rev-list --parents ...` call to avoid this.)
        fields = rev_line.strip().split(" ")
        rev, parents = fields[0], fields[1:]
        res = get_status_dict("run", ds=dset, commit=rev, parents=parents)
        full_msg = ds_repo.format_commit("%B", rev)
        try:
            msg, info = get_run_info(dset, full_msg)
        except ValueError as exc:
            # Recast the error so the message includes the revision.
            raise ValueError("Error on {}'s message: {}".format(
                rev, exc_str(exc)))

        if info is not None:
            if len(parents) != 1:
                lgr.warning(
                    "%s has run information but is a %s commit; "
                    "it will not be re-executed", rev,
                    "merge" if len(parents) > 1 else "root")
                continue
            res["run_info"] = info
            res["run_message"] = msg
        yield dict(res, status="ok")
Example 19
File: wtf.py Project: hanke/datalad
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import os
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
Example 20
File: wtf.py Project: ypid/datalad
def _describe_annex():
    from datalad.cmd import (
        GitWitlessRunner,
        StdOutErrCapture,
    )

    runner = GitWitlessRunner()
    try:
        out = runner.run(['git', 'annex', 'version'],
                         protocol=StdOutErrCapture)
    except CommandError as e:
        return dict(
            version='not available',
            message=exc_str(e),
        )
    info = {}
    for line in out['stdout'].split(os.linesep):
        key = line.split(':')[0]
        if not key:
            continue
        value = line[len(key) + 2:].strip()
        key = key.replace('git-annex ', '')
        if key.endswith('s'):
            value = value.split()
        info[key] = value
    return info
Example 21
    def _has_active_postupdate(ds, name, ssh):
        """Figure out either has active post-update hook

        Returns
        -------
        bool or None
          None if something went wrong and we could not figure out
        """
        has_active_post_update = None
        try:
            # TODO -- we might need to expanduser taking .user into account
            # but then it must be done also on remote side
            out = CreateSibling._run_on_ds_ssh_remote(
                ds, name, ssh,
                'cd {path} && [ -x .git/hooks/post-update ] && echo yes || echo no'
            )
            out = out.strip()
            assert out in ('yes', 'no')
            has_active_post_update = out == "yes"
        except CommandError as e:
            lgr.debug(
                "Could not figure out either %s on remote %s has active "
                "post_update hook due to %s",
                ds, name, exc_str(e)
            )
        return has_active_post_update
Example 22
    def __call__(astype, dataset, getcmdhelp=False, output=None, **kwargs):
        # get a handle on the relevant plugin module
        import datalad.export as export_mod
        try:
            exmod = import_module('.%s' % (astype, ),
                                  package=export_mod.__package__)
        except ImportError as e:
            raise ValueError("cannot load exporter '{}': {}".format(
                astype, exc_str(e)))
        if getcmdhelp:
            # no result, but return the module to make the renderer do the rest
            return (exmod, None)

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='exporting')
        # call the plugin, either with the argv array from the cmdline call
        # or directly with the kwargs
        if 'datalad_unparsed_args' in kwargs:
            result = exmod._datalad_export_plugin_call(
                ds, argv=kwargs['datalad_unparsed_args'], output=output)
        else:
            result = exmod._datalad_export_plugin_call(ds,
                                                       output=output,
                                                       **kwargs)
        return (exmod, result)
Example 23
def _generate_extension_api():
    """Auto detect all available extensions and generate an API from them
    """
    from importlib import import_module
    from pkg_resources import iter_entry_points
    from .interface.base import get_api_name

    from datalad.dochelpers import exc_str
    import logging
    lgr = logging.getLogger('datalad.api')

    for entry_point in iter_entry_points('datalad.extensions'):
        try:
            lgr.debug(
                'Loading entrypoint %s from datalad.extensions for API building',
                entry_point.name)
            grp_descr, interfaces = entry_point.load()
            lgr.debug(
                'Loaded entrypoint %s from datalad.extensions',
                entry_point.name)
        except Exception as e:
            lgr.warning('Failed to load entrypoint %s: %s', entry_point.name, exc_str(e))
            continue

        for intfspec in interfaces:
            # turn the interface spec into an instance
            mod = import_module(intfspec[0])
            intf = getattr(mod, intfspec[1])
            api_name = get_api_name(intfspec)
            if api_name in globals():
                lgr.debug(
                    'Command %s from extension %s is replacing a previously loaded implementation',
                    api_name,
                    entry_point.name)
            globals()[api_name] = intf.__call__
Example 24
def check_compress_file(ext, annex, path, name):
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive,
                   path=path)
    assert_true(op.exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
Example 25
File: wtf.py Project: ypid/datalad
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    from datalad.utils import get_linux_distribution
    try:
        dist = get_linux_distribution()
    except Exception as exc:
        lgr.warning("Failed to get distribution information: %s", exc_str(exc))
        dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([_t2s(dist),
                                  _t2s(pl.mac_ver()),
                                  _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
Example 26
    def _transfer(self, cmd, key, path):

        akeys_tried = []
        # the same file could come from multiple files within the same archive
        # So far it doesn't make sense to "try all" of them since if one fails
        # it means the others would fail too, so it makes sense to immediately
        # prune the list so we keep only the ones from unique akeys.
        # Maybe whenever we support extraction directly from the tarballs
        # we should go through all and choose the one easiest to get or smth.
        for akey, afile in self._gen_akey_afiles(key,
                                                 sorted=True,
                                                 unique_akeys=True):
            akeys_tried.append(akey)
            try:
                akey_fpath = self.get_contentlocation(akey)
                if not akey_fpath:
                    # TODO: make it more stringent?
                    # The command could have failed to run if the key was not present locally yet,
                    # so retrieve the key using annex
                    # TODO: we need to somehow report to the user that this is happening, and show download progress
                    self.runner(["git-annex", "get", "--key", akey],
                                cwd=self.path,
                                expect_stderr=True)

                    akey_fpath = self.get_contentlocation(akey)
                    if not akey_fpath:
                        raise RuntimeError(
                            "We were reported to fetch it alright but now can't get its location.  Check logic"
                        )

                akey_path = opj(self.repo.path, akey_fpath)
                assert exists(
                    akey_path), "Key file %s is not present" % akey_path

                # Extract that bloody file from the bloody archive
                # TODO: implement/use caching, for now a simple one
                #  actually patool doesn't support extraction of a single file
                #  https://github.com/wummel/patool/issues/20
                # so
                pwd = getpwd()
                lgr.debug(
                    "Getting file {afile} from {akey_path} while PWD={pwd}".
                    format(**locals()))
                apath = self.cache[akey_path].get_extracted_file(afile)
                link_file_load(apath, path)
                self.send('TRANSFER-SUCCESS', cmd, key)
                return
            except Exception as exc:
                # from celery.contrib import rdb
                # rdb.set_trace()
                from datalad.dochelpers import exc_str
                exc_ = exc_str(exc)
                self.debug(
                    "Failed to fetch {akey} containing {key}: {exc_}".format(
                        **locals()))
                continue

        self.error(
            "Failed to fetch any archive containing {key}. Tried: {akeys}".
            format(**locals()))
Example 27
    def __contains__(self, url):
        try:
            return self._get_provider(url) in self._cookies_db
        except Exception as exc:
            lgr.warning("Failed to check for having a cookie for %s: %s",
                        url, exc_str(exc))
            return None
Example 28
def _read(stream, input_type):
    if input_type in ["csv", "tsv"]:
        import csv
        csvrows = csv.reader(stream,
                             delimiter="\t" if input_type == "tsv" else ",")
        try:
            headers = next(csvrows)
        except StopIteration:
            raise ValueError("Failed to read {} rows from {}".format(
                input_type.upper(), stream))
        lgr.debug("Taking %s fields from first line as headers: %s",
                  len(headers), headers)
        idx_map = dict(enumerate(headers))
        rows = [dict(zip(headers, r)) for r in csvrows]
    elif input_type == "json":
        import json
        try:
            rows = json.load(stream)
        except json.decoder.JSONDecodeError as e:
            raise ValueError("Failed to read JSON from stream {}: {}".format(
                stream, exc_str(e)))
        # For json input, we do not support indexing by position,
        # only names.
        idx_map = {}
    else:
        raise ValueError("input_type {} is invalid. Known values: {}".format(
            input_type, ", ".join(INPUT_TYPES)))
    return rows, idx_map
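A quick sketch of feeding the function above a TSV stream (assumes _read and its module-level lgr logger are importable; the data is made up):

import io
rows, idx_map = _read(io.StringIO("name\tsize\nfile1\t10\n"), "tsv")
# rows == [{'name': 'file1', 'size': '10'}]
# idx_map == {0: 'name', 1: 'size'}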
Example 29
def get_commit_runinfo(repo, commit="HEAD"):
    """Return message and run record from a commit message

    If none found - returns None, None; if anything goes wrong - throws
    ValueError with the message describing the issue
    """
    commit_msg = repo.repo.git.show(commit, "--format=%s%n%n%b", "--no-patch")
    cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ' \
                   r'===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'
    runinfo = re.match(cmdrun_regex, commit_msg, re.MULTILINE | re.DOTALL)
    if not runinfo:
        return None, None

    rec_msg, runinfo = runinfo.groups()

    try:
        runinfo = json.loads(runinfo)
    except Exception as e:
        raise ValueError(
            'cannot rerun command, command specification is not valid JSON: '
            '%s' % exc_str(e))
    if 'cmd' not in runinfo:
        raise ValueError(
            "{} looks like a run commit but does not have a command".format(
                repo.repo.git.rev_parse("--short", commit)))
    return rec_msg, runinfo
Example 30
    def __getitem__(self, url):
        try:
            return self._cookies_db[self._get_provider(url)]
        except Exception as exc:
            lgr.warning("Failed to get a cookie for %s: %s",
                        url, exc_str(exc))
            return None
Example 31
    def __contains__(self, url):
        try:
            return self._get_provider(url) in self.cookies_db
        except Exception as exc:
            lgr.warning("Failed to check for having a cookie for %s: %s", url,
                        exc_str(exc))
            return None
Example 32
def _generate_extension_api():
    """Auto detect all available extensions and generate an API from them
    """
    from importlib import import_module
    from pkg_resources import iter_entry_points
    from .interface.base import get_api_name

    from datalad.dochelpers import exc_str
    import logging
    lgr = logging.getLogger('datalad.api')

    for entry_point in iter_entry_points('datalad.extensions'):
        try:
            lgr.debug(
                'Loading entrypoint %s from datalad.extensions for API building',
                entry_point.name)
            grp_descr, interfaces = entry_point.load()
            lgr.debug('Loaded entrypoint %s from datalad.extensions',
                      entry_point.name)
        except Exception as e:
            lgr.warning('Failed to load entrypoint %s: %s', entry_point.name,
                        exc_str(e))
            continue

        for intfspec in interfaces:
            # turn the interface spec into an instance
            mod = import_module(intfspec[0])
            intf = getattr(mod, intfspec[1])
            api_name = get_api_name(intfspec)
            if api_name in globals():
                lgr.debug(
                    'Command %s from extension %s is replacing a previously loaded implementation',
                    api_name, entry_point.name)
            globals()[api_name] = intf.__call__
Example 33
    def close(self, allow_fail=True):
        """Closes all connections, known to this instance.

        Parameters
        ----------
        allow_fail: bool, optional
          If True, swallow exceptions which might be thrown during
          connection.close, and just log them at DEBUG level
        """
        if self._connections:
            to_close = [c for c in self._connections
                        # don't close if connection wasn't opened by SSHManager
                        if self._connections[c].ctrl_path
                        not in self._prev_connections and
                        exists(self._connections[c].ctrl_path)]
            if to_close:
                lgr.debug("Closing %d SSH connections..." % len(to_close))
            for cnct in to_close:
                f = self._connections[cnct].close
                if allow_fail:
                    f()
                else:
                    try:
                        f()
                    except Exception as exc:
                        lgr.debug("Failed to close a connection: "
                                  "%s", exc_str(exc))
            self._connections = dict()
Example 34
File: wtf.py Project: ypid/datalad
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
Example 35
    def close(self, allow_fail=True, ctrl_path=None):
        """Closes all connections, known to this instance.

        Parameters
        ----------
        allow_fail: bool, optional
          If True, swallow exceptions which might be thrown during
          connection.close, and just log them at DEBUG level
        ctrl_path: str or list of str, optional
          If specified, only the path(s) provided would be considered
        """
        if self._connections:
            from datalad.utils import assure_list
            ctrl_paths = assure_list(ctrl_path)
            to_close = [c for c in self._connections
                        # don't close if connection wasn't opened by SSHManager
                        if self._connections[c].ctrl_path
                        not in self._prev_connections and
                        exists(self._connections[c].ctrl_path)
                        and (not ctrl_paths
                             or self._connections[c].ctrl_path in ctrl_paths)]
            if to_close:
                lgr.debug("Closing %d SSH connections..." % len(to_close))
            for cnct in to_close:
                f = self._connections[cnct].close
                if allow_fail:
                    f()
                else:
                    try:
                        f()
                    except Exception as exc:
                        lgr.debug("Failed to close a connection: "
                                  "%s", exc_str(exc))
            self._connections = dict()
Example 36
def _read(stream, input_type):
    if input_type == "csv":
        import csv
        csvrows = csv.reader(stream)
        try:
            headers = next(csvrows)
        except StopIteration:
            raise ValueError("Failed to read CSV rows from {}".format(stream))
        lgr.debug("Taking %s fields from first line as headers: %s",
                  len(headers), headers)
        idx_map = dict(enumerate(headers))
        rows = [dict(zip(headers, r)) for r in csvrows]
    elif input_type == "json":
        import json
        try:
            rows = json.load(stream)
        except json.decoder.JSONDecodeError as e:
            raise ValueError("Failed to read JSON from stream {}: {}".format(
                stream, exc_str(e)))
        # For json input, we do not support indexing by position,
        # only names.
        idx_map = {}
    else:
        raise ValueError("input_type must be 'csv', 'json', or 'ext'")
    return rows, idx_map
Example 37
File: wtf.py Project: ypid/datalad
def _describe_dataset(ds, sensitive):
    from datalad.interface.results import success_status_map
    from datalad.api import metadata

    try:
        infos = {
            'path': ds.path,
            'repo': ds.repo.__class__.__name__ if ds.repo else None,
            'id': ds.id,
        }
        if not sensitive:
            infos['metadata'] = _HIDDEN
        elif ds.id:
            ds_meta = metadata(
                dataset=ds,
                reporton='datasets',
                return_type='list',
                result_filter=lambda x: x['action'] == 'metadata' and
                success_status_map[x['status']] == 'success',
                result_renderer='disabled',
                on_failure='ignore')
            if ds_meta:
                ds_meta = [dm['metadata'] for dm in ds_meta]
                if len(ds_meta) == 1:
                    ds_meta = ds_meta.pop()
                infos['metadata'] = ds_meta
            else:
                infos['metadata'] = None
        return infos
    except InvalidGitRepositoryError as e:
        return {"invalid": exc_str(e)}
Example 38
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        contentmeta = []
        for f in self.paths:
            fpath = opj(self.ds.path, f)
            try:
                img = Image.open(fpath)
            except Exception as e:
                lgr.debug("Image metadata parser failed to load %s: %s",
                          fpath, exc_str(e))
                continue
            meta = {
                'type': 'dctype:Image',
            }

            # run all extractors
            meta.update({k: v(img) for k, v in self._extractors.items()})
            # filter useless fields (empty strings and NaNs)
            meta = {k: v for k, v in meta.items()
                    if not (hasattr(v, '__len__') and not len(v))}
            contentmeta.append((f, meta))

        return {
            '@context': vocabulary,
        }, \
            contentmeta
Example 39
    def _load(self):
        if self._cookies_db is not None:
            return
        if self._filename:
            filename = self._filename
            cookies_dir = os.path.dirname(filename)
        else:
            cookies_dir = os.path.join(
                appdirs.user_config_dir(),
                'datalad')  # FIXME prolly shouldn't hardcode 'datalad'
            filename = os.path.join(cookies_dir, 'cookies')

        # TODO: guarantee restricted permissions

        if not os.path.exists(cookies_dir):
            os.makedirs(cookies_dir)

        lgr.debug("Opening cookies DB %s", filename)
        try:
            self._cookies_db = shelve.open(filename,
                                           writeback=True,
                                           protocol=2)
        except Exception as exc:
            lgr.warning("Failed to open cookies DB %s: %s", filename,
                        exc_str(exc))
Example 40
def import_modules(modnames,
                   pkg,
                   msg="Failed to import {module}",
                   log=lgr.debug):
    """Helper to import a list of modules without failing if N/A

    Parameters
    ----------
    modnames: list of str
      List of module names to import
    pkg: str
      Package under which to import
    msg: str, optional
      Message template for .format() to log at DEBUG level if import fails.
      Keys {module} and {package} will be provided and ': {exception}' appended
    log: callable, optional
      Logger call to use for logging messages
    """
    from importlib import import_module
    _globals = globals()
    mods_loaded = []
    for modname in modnames:
        try:
            _globals[modname] = mod = import_module('.{}'.format(modname), pkg)
            mods_loaded.append(mod)
        except Exception as exc:
            from datalad.dochelpers import exc_str
            log((msg + ': {exception}').format(module=modname,
                                               package=pkg,
                                               exception=exc_str(exc)))
    return mods_loaded
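A hypothetical call of the helper above (the module and package names are placeholders; import failures are only logged, not raised):

loaded = import_modules(['foo', 'bar'], pkg='mypackage')
# returns a list with only the modules that imported successfully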
Example 41
def check_crawl_autoaddtext(gz, ind, topurl, outd):
    ds = create(outd)
    ds.run_procedure("cfg_text2git")
    with chpwd(outd):  # TODO -- dataset argument
        template_kwargs = {
            'url': topurl,
            'a_href_match_': '.*',
        }
        if gz:
            template_kwargs['archives_re'] = r"\.gz$"
        crawl_init(template_kwargs, save=True, template='simple_with_archives')
        try:
            crawl()
        except MissingExternalDependency as exc:
            raise SkipTest(exc_str(exc))
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)

    if 'compressed.dat.gz' in TEST_TREE2:
        if gz:
            ok_file_under_git(outd, "compressed.dat", annexed=False)
            ok_file_has_content(op.join(outd, "compressed.dat"),
                                u"мама мыла раму")
        else:
            ok_file_under_git(outd, "compressed.dat.gz", annexed=True)
    else:
        raise SkipTest(
            "Need datalad >= 0.11.2 to test .gz files decompression")
Example 42
File: get.py Project: hanke/datalad
def _install_necessary_subdatasets(
        ds, path, reckless, refds_path, description=None):
    """Installs subdatasets of `ds`, that are necessary to obtain in order
    to have access to `path`.

    Gets the subdataset containing `path` regardless of whether or not it was
    already installed. While doing so, installs everything necessary in between
    the uppermost installed one and `path`.

    Note: `ds` itself has to be installed.

    Parameters
    ----------
    ds: Dataset
    path: str
    reckless: bool
    """
    # figuring out what dataset to start with, --contains limits --recursive
    # to visit only subdataset on the trajectory to the target path
    subds_trail = ds.subdatasets(contains=path, recursive=True)
    if not subds_trail:
        # there is not a single known subdataset (installed or not)
        # for this path -- job done
        return
    # otherwise we start with the one deepest down
    cur_subds = subds_trail[-1]

    while not GitRepo.is_valid_repo(cur_subds['path']):
        # install using helper that give some flexibility regarding where to
        # get the module from
        try:
            sd = _install_subds_from_flexible_source(
                Dataset(cur_subds['parentds']),
                relpath(cur_subds['path'], start=cur_subds['parentds']),
                cur_subds['gitmodule_url'],
                reckless,
                description=description)
        except Exception as e:
            # skip all of downstairs, if we didn't manage to install subdataset
            yield get_status_dict(
                'install', path=cur_subds['path'], type='dataset',
                status='error', logger=lgr, refds=refds_path,
                message=("Installation of subdatasets %s failed with exception: %s",
                         cur_subds['path'], exc_str(e)))
            return

        # report installation, whether it helped or not
        yield get_status_dict(
            'install', ds=sd, status='ok', logger=lgr, refds=refds_path,
            message=("Installed subdataset in order to get %s", path))

        # now check whether the just installed subds brought us any closer to
        # the target path
        subds_trail = sd.subdatasets(contains=path, recursive=False)
        if not subds_trail:
            # no (newly available) subdataset gets us any closer
            return
        # next round
        cur_subds = subds_trail[-1]
Example 43
def _handle_exception(e, bucket_name):
    """Helper to handle S3 connection exception"""
    raise (
        AccessDeniedError
        if e.error_code == 'AccessDenied'
        else DownloadError)(
            "Cannot connect to %s S3 bucket. Exception: %s"
            % (bucket_name, exc_str(e))
        )
Example 44
def _clone_from_any_source(sources, dest):
    # should not be the case, but we need to distinguish between failure
    # of git-clone, due to existing target and an unsuccessful clone
    # attempt. See below.
    existed = dest and exists(dest)
    for source_ in sources:
        try:
            lgr.debug("Retrieving a dataset from URL: "
                      "{0}".format(source_))
            with swallow_logs():
                GitRepo.clone(path=dest, url=source_, create=True)
            return source_  # do not bother with other sources if succeeded
        except GitCommandError as e:
            lgr.debug("Failed to retrieve from URL: "
                      "{0}".format(source_))
            if not existed and dest \
                    and exists(dest):
                lgr.debug("Wiping out unsuccessful clone attempt at "
                          "{}".format(dest))
                rmtree(dest)

            if source_ == sources[-1]:
                # Note: The following block is evaluated whenever we
                # fail even with the last try. Not nice, but currently
                # necessary until we get a more precise exception:
                ####################################
                # TODO: We may want to introduce a --force option to
                # overwrite the target.
                # TODO: Currently assuming if `existed` and there is a
                # GitCommandError means that these both things are connected.
                # Need newer GitPython to get stderr from GitCommandError
                # (already fixed within GitPython.)
                if existed:
                    # rudimentary check for an installed dataset at target:
                    # (TODO: eventually check for being the one, that this
                    # is about)
                    dest_ds = Dataset(dest)
                    if dest_ds.is_installed():
                        lgr.info("{0} appears to be installed already."
                                 "".format(dest_ds))
                        break
                    else:
                        lgr.warning("Target {0} already exists and is not "
                                    "an installed dataset. Skipped."
                                    "".format(dest))
                        # Keep original in debug output:
                        lgr.debug("Original failure:{0}"
                                  "{1}".format(linesep, exc_str(e)))
                        return None
                ##################

                # Re-raise if failed even with the last candidate
                lgr.debug("Unable to establish repository instance at "
                          "{0} from {1}"
                          "".format(dest, sources))
                raise
Example 45
def test_GitRepo_gitpy_injection(path, path2):

    gr = GitRepo(path, create=True)
    gr._GIT_COMMON_OPTIONS.extend(['test-option'])

    with assert_raises(GitCommandError) as cme:
        gr.repo.git.unknown_git_command()
    assert_in('test-option', exc_str(cme.exception))

    # once set, these option should be persistent across git calls:
    with assert_raises(GitCommandError) as cme:
        gr.repo.git.another_unknown_git_command()
    assert_in('test-option', exc_str(cme.exception))

    # but other repos should not be affected:
    gr2 = GitRepo(path2, create=True)
    with assert_raises(GitCommandError) as cme:
        gr2.repo.git.unknown_git_command()
    assert_not_in('test-option', exc_str(cme.exception))
Example 46
def test_wtf(path):
    # smoke test for now
    with swallow_outputs() as cmo:
        wtf(dataset=path)
        assert_not_in('## dataset', cmo.out)
        assert_in('## configuration', cmo.out)
        # Those sections get censored out by default now
        assert_not_in('user.name: ', cmo.out)
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf()
            assert_not_in('## dataset', cmo.out)
            assert_in('## configuration', cmo.out)
    # now with a dataset
    ds = create(path)
    with swallow_outputs() as cmo:
        wtf(dataset=ds.path)
        assert_in('## configuration', cmo.out)
        assert_in('## dataset', cmo.out)
        assert_in('path: {}'.format(ds.path), cmo.out)

    # and if we run with all sensitive
    for sensitive in ('some', True):
        with swallow_outputs() as cmo:
            wtf(dataset=ds.path, sensitive=sensitive)
            # we fake those for tests anyways, but we do show cfg in this mode
            # and explicitly not showing them
            assert_in('user.name: %s' % _HIDDEN, cmo.out)

    with swallow_outputs() as cmo:
        wtf(dataset=ds.path, sensitive='all')
        assert_not_in(_HIDDEN, cmo.out)  # all is shown
        assert_in('user.name: ', cmo.out)

    skip_if_no_module('pyperclip')

    # verify that it works correctly in the env/platform
    import pyperclip
    with swallow_outputs() as cmo:
        try:
            pyperclip.copy("xxx")
            pyperclip_works = pyperclip.paste().strip() == "xxx"
            wtf(dataset=ds.path, clipboard=True)
        except (AttributeError, pyperclip.PyperclipException) as exc:
            # AttributeError could come from pyperclip if no DISPLAY
            raise SkipTest(exc_str(exc))
        assert_in("WTF information of length", cmo.out)
        assert_not_in('user.name', cmo.out)
        if not pyperclip_works:
            # Sometimes it does not throw but just fails to work
            raise SkipTest(
                "Pyperclip seems to be not functioning here correctly")
        assert_not_in('user.name', pyperclip.paste())
        assert_in(_HIDDEN, pyperclip.paste())  # by default no sensitive info
        assert_in("cmd:annex:", pyperclip.paste())  # but the content is there
Example No. 47
0
def _get_github_entity(gh, cred, github_user, github_passwd, github_organization):
    # figure out authentication
    if not (github_user and github_passwd):
        # access to the system secrets
        if github_user:
            # check that the keystore knows about this user
            if github_user != cred.get('user', github_user):
                # there is a mismatch, we need to ask
                creds = cred.enter_new()
                github_user = creds['user']
                github_passwd = creds['password']

        # if a user is provided, go with it, don't even ask any store
        if github_user is None and not cred.is_known:
            # let's figure out authentication
            if github_user is None:
                # check if there is an oauth token from
                # https://github.com/sociomantic/git-hub
                github_user = cfg.get('hub.oauthtoken', None)

        if github_user is None:
            # still nothing, ask if necessary
            creds = cred()
            github_user = creds['user']
            github_passwd = creds['password']

    if not github_user:
        raise gh.BadCredentialsException(403, 'no user specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # XXX make sure to wipe out known credentials if that happens
    authed_gh = gh.Github(
        github_user,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                                 github_organization,
                                 exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
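
For reference, the underlying PyGithub calls this helper builds on look roughly like the sketch below. The user, password, and organization names are placeholders, and this assumes the old username/password authentication that these snippets use; recent PyGithub releases expect a token or Auth object instead.

# Sketch of the underlying PyGithub calls; user, password and organization
# names are placeholders, not values used by DataLad.
from github import Github
from github.GithubException import BadCredentialsException, UnknownObjectException

gh = Github("some-user", password="some-password")   # nothing is validated yet
try:
    org = gh.get_organization("some-organization")    # first real API request
except BadCredentialsException:
    # wrong credentials only surface on the first request, as noted above
    raise
except UnknownObjectException:
    # no such organization visible to this account
    raise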
Example No. 48
0
    def get_git_version(self):
        key = 'cmd:git'
        if key in self._remote_props:
            return self._remote_props[key]
        git_version = None
        try:
            git_version = self('git version')[0].split()[2]
        except CommandError as e:
            lgr.debug('Failed to determine Git version: %s',
                      exc_str(e))
        self._remote_props[key] = git_version
        return git_version
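
The same probe can be run against the local git without the SSH wrapper; a small sketch using only the standard library:

# Local counterpart of the probe above: parse the `git version` output.
import subprocess

def get_local_git_version():
    try:
        out = subprocess.check_output(["git", "version"]).decode()
    except (OSError, subprocess.CalledProcessError):
        return None
    # output looks like "git version 2.39.2"; the third field is the version
    return out.split()[2]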
Example No. 49
0
    def assure_initialized(self):
        """Assures that manager is initialized - knows socket_dir, previous connections
        """
        if self._socket_dir is not None:
            return
        from ..config import ConfigManager
        from os import chmod
        cfg = ConfigManager()
        self._socket_dir = opj(cfg.obtain('datalad.locations.cache'),
                               'sockets')
        assure_dir(self._socket_dir)
        try:
            chmod(self._socket_dir, 0o700)
        except OSError as exc:
            lgr.warning(
                "Failed to (re)set permissions on %s. "
                "Future communication will most likely be impaired or fail. "
                "Original exception: %s",
                self._socket_dir, exc_str(exc)
            )

        from os import listdir
        from os.path import isdir
        try:
            self._prev_connections = [opj(self.socket_dir, p)
                                      for p in listdir(self.socket_dir)
                                      if not isdir(opj(self.socket_dir, p))]
        except OSError as exc:
            self._prev_connections = []
            lgr.warning(
                "Failed to list %s for existing sockets. "
                "Future communication will most likely be impaired or fail. "
                "Original exception: %s",
                self._socket_dir, exc_str(exc)
            )

        lgr.log(5,
                "Found %d previous connections",
                len(self._prev_connections))
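
The initialization above boils down to three steps: create the socket directory if needed, tighten its permissions, and remember any socket files left over from previous runs. A standalone sketch of that sequence (the directory path is a placeholder):

# Standalone sketch of the socket-directory setup; the path is a placeholder.
import os

def init_socket_dir(socket_dir="/tmp/example-sockets"):
    os.makedirs(socket_dir, exist_ok=True)
    try:
        os.chmod(socket_dir, 0o700)       # keep control sockets private
    except OSError:
        pass                              # best effort, as in the code above
    # non-directory entries are taken to be sockets left by earlier sessions
    return [os.path.join(socket_dir, p)
            for p in os.listdir(socket_dir)
            if not os.path.isdir(os.path.join(socket_dir, p))]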
Example No. 50
0
def _get_github_entity(gh, cred, github_login, github_passwd, github_organization):
    if github_login == 'disabledloginfortesting':
        raise gh.BadCredentialsException(403, 'no login specified')
    if not (github_login and github_passwd):
        # we don't have both
        # check if there is an oauth token from
        # https://github.com/sociomantic/git-hub
        token = False
        if not cred.is_known:
            if not github_login:
                # try to find an oauth token to use as the login
                github_login = cfg.get('hub.oauthtoken', None)
                token = True
            if not (github_login and (github_passwd or token)):
                # still at least one missing, utilize the credential store
                # to get auth info, pass potential passwd value along
                cred.enter_new(
                    user=github_login,
                    password=github_passwd)
        # now we should really have it
        creds = cred()
        github_login = creds['user']
        github_passwd = creds['password']

    if not github_login:
        raise gh.BadCredentialsException(403, 'no login specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # in that case, known credentials are wiped out again below
    authed_gh = gh.Github(
        github_login,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                                 github_organization,
                                 exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
Example No. 51
0
File: get.py Project: hanke/datalad
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None,
                                        refds_path=None, description=None):
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    # install using a helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.subdatasets(
            return_type='generator', result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if start is not None and not path_is_subpath(subds.path, start):
            # this one we can ignore, not underneath the start path
            continue
        if sub.get('state', None) != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # do not continue, even if an intermediate dataset exists it
            # does not imply that everything below it does too
        else:
            # try to get this dataset
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(sub['path'], start=ds.path),
                    sub['gitmodule_url'],
                    reckless,
                    description=description)
                yield get_status_dict(
                    'install', ds=subds, status='ok', logger=lgr, refds=refds_path,
                    message=("Installed subdataset %s", subds), parentds=ds.path)
            except Exception as e:
                # skip everything underneath, if we didn't manage to install the subdataset
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr, refds=refds_path,
                    message=("Installation of subdatasets %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1 if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res
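
The recursion-limit handling is a small pattern on its own: decrement an integer limit at each level, treat a non-integer (e.g. None) as "unlimited", and stop once the limit reaches zero. A generic sketch of the same pattern on a plain nested dict, unrelated to DataLad's datasets:

# Generic sketch of the recursion_limit pattern on a plain nested dict.
def walk(tree, recursion_limit=None):
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    for name, subtree in tree.items():
        yield name
        for item in walk(
                subtree,
                recursion_limit - 1 if isinstance(recursion_limit, int)
                else recursion_limit):
            yield item

tree = {"a": {"b": {"c": {}}}, "d": {}}
print(list(walk(tree, recursion_limit=2)))   # -> ['a', 'b', 'd']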
Example No. 52
0
    def init_remote_repo(path, ssh, shared, dataset, description=None):
        cmd = ["git", "-C", path, "init"]
        if shared:
            cmd.append("--shared=%s" % shared)
        try:
            ssh(cmd)
        except CommandError as e:
            lgr.error("Initialization of remote git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False

        if isinstance(dataset.repo, AnnexRepo):
            # init remote git annex repo (part fix of #463)
            try:
                ssh(
                    ["git", "-C", path, "annex", "init"] +
                    ([description] if description else [])
                )
            except CommandError as e:
                lgr.error("Initialization of remote git annex repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, exc_str(e)))
                return False
        return True
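
Because the command is built as an argument list here, no shell quoting is needed. A local sketch of the same construction run with subprocess instead of over the SSH wrapper (path and sharing mode are placeholders):

# Local sketch of the same command construction, run with subprocess rather
# than over the SSH wrapper; path and shared mode are placeholders.
import subprocess

def init_repo(path, shared=None):
    cmd = ["git", "-C", path, "init"]
    if shared:
        cmd.append("--shared=%s" % shared)    # e.g. "group" or "0660"
    subprocess.check_call(cmd)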
Example No. 53
0
def get_bucket(conn, bucket_name):
    """A helper to get a bucket

    Parameters
    ----------
    conn: boto S3 connection
        Established connection to query
    bucket_name: str
        Name of the bucket to connect to
    """
    bucket = None
    try:
        bucket = conn.get_bucket(bucket_name)
    except S3ResponseError as e:
        # access to the specific bucket by name may initially be denied or fail,
        # so we may need to list which buckets are available under the given
        # credentials:
        lgr.debug("Cannot access bucket %s by name: %s", bucket_name, exc_str(e))
        if conn.anon:
            raise AnonymousAccessDeniedError(
                "Access to the bucket %s did not succeed.  Requesting "
                "'all buckets' for anonymous S3 connection makes "
                "little sense and thus not supported." % bucket_name,
                supported_types=['aws-s3']
            )
        all_buckets = []
        try:
            all_buckets = conn.get_all_buckets()
        except S3ResponseError as e2:
            lgr.debug("Cannot access all buckets: %s", exc_str(e2))
            _handle_exception(e, 'any (originally requested %s)' % bucket_name)
        all_bucket_names = [b.name for b in all_buckets]
        lgr.debug("Found following buckets %s", ', '.join(all_bucket_names))
        if bucket_name in all_bucket_names:
            bucket = all_buckets[all_bucket_names.index(bucket_name)]
        else:
            _handle_exception(e, bucket_name)
    return bucket
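
A stripped-down sketch of the same "try the bucket by name, then fall back to listing all buckets" logic with plain boto (the legacy boto 2 API these snippets use); the bucket name is a placeholder and credentials come from the environment:

# Plain boto 2 sketch of the "by name first, then list all buckets" fallback;
# the bucket name is a placeholder, credentials come from the environment.
import boto
from boto.exception import S3ResponseError

conn = boto.connect_s3()
try:
    bucket = conn.get_bucket("example-bucket")
except S3ResponseError:
    # direct access failed; see whether the bucket shows up in a listing
    matches = [b for b in conn.get_all_buckets() if b.name == "example-bucket"]
    bucket = matches[0] if matches else None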
Example No. 54
0
def _revs_as_results(dset, revs):
    for rev in revs:
        res = get_status_dict("run", ds=dset, commit=rev)
        full_msg = dset.repo.format_commit("%B", rev)
        try:
            msg, info = get_run_info(dset, full_msg)
        except ValueError as exc:
            # Recast the error so the message includes the revision.
            raise ValueError(
                "Error on {}'s message: {}".format(rev, exc_str(exc)))

        if info is not None:
            res["run_info"] = info
            res["run_message"] = msg
        yield dict(res, status="ok")
Example No. 55
0
    def get_remote_git_version(ssh):
        try:
            # options to disable all auto so we don't trigger them while testing
            # for absent changes
            out, err = ssh(["git"] + ["version"])
            assert out.strip().startswith("git version")
            git_version = out.strip().split()[2]
            lgr.debug("Detected git version on server: %s" % git_version)
            return LooseVersion(git_version)

        except CommandError as e:
            lgr.warning(
                "Failed to determine git version on remote.\n"
                "Error: {0}\nTrying to configure anyway "
                "...".format(exc_str(e)))
        return None
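
Wrapping the parsed string in LooseVersion makes version comparisons component-wise instead of lexicographic; a two-line illustration:

# Why LooseVersion rather than plain string comparison: "2.10" is newer than "2.9".
from distutils.version import LooseVersion

assert LooseVersion("2.10.0") > LooseVersion("2.9.5")   # component-wise comparison
assert not ("2.10.0" > "2.9.5")                         # string comparison gets this wrong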
Example No. 56
0
    def __call__(paths,
                 reference_date="@1514764800",
                 revs=None,
                 annex="all",
                 no_tags=False,
                 older=False):
        from datalad.support.repodates import check_dates

        which = "older" if older else "newer"

        try:
            ref_ts = _parse_date(reference_date)
        except ValueError as exc:
            lgr.error("Could not parse '%s' as a date", reference_date)
            yield get_status_dict("check_dates",
                                  status="error",
                                  message=exc_str(exc))
            return

        lgr.info("Searching for dates %s than %s",
                 which,
                 time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))

        for repo in _git_repos(paths or ["."]):
            fullpath = os.path.abspath(repo)
            lgr.debug("Checking %s", fullpath)

            try:
                report = check_dates(repo,
                                     ref_ts,
                                     which=which,
                                     revs=revs or ["--all"],
                                     annex={"all": True,
                                            "none": False,
                                            "tree": "tree"}[annex],
                                     tags=not no_tags)
            except InvalidGitRepositoryError as exc:
                lgr.warning("Skipping invalid Git repo %s: %s",
                            repo, exc_str(exc))
                continue

            yield get_status_dict(
                "check_dates",
                status="ok",
                path=fullpath,
                message=("Found {} dates" if report["objects"]
                         else "No {} dates found").format(which),
                report=report)
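
The default reference_date of "@1514764800" is a Unix epoch timestamp prefixed with "@"; the same strftime/gmtime pair used in the log line above renders it as a readable UTC date:

# Rendering the default reference epoch the same way the log message above does.
import time

ref_ts = 1514764800    # the "@1514764800" default, stripped of its "@" prefix
print(time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))
# -> 01 Jan 2018 00:00:00 +0000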
Example No. 57
0
def get_run_info(dset, message):
    """Extract run information from `message`

    Parameters
    ----------
    message : str
        A commit message.

    Returns
    -------
    A tuple with the command's message and a dict with run information. Both
    these values are None if `message` doesn't have a run command.

    Raises
    ------
    A ValueError if the information in `message` is invalid.
    """
    cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ' \
                   r'===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'
    runinfo = re.match(cmdrun_regex, message, re.MULTILINE | re.DOTALL)
    if not runinfo:
        return None, None

    rec_msg, runinfo = runinfo.groups()

    try:
        runinfo = json.loads(runinfo)
    except Exception as e:
        raise ValueError(
            'cannot rerun command, command specification is not valid JSON: '
            '%s' % exc_str(e)
        )
    if not isinstance(runinfo, (list, dict)):
        # this is a run record ID -> load the beast
        record_dir = dset.config.get(
            'datalad.run.record-directory',
            default=op.join('.datalad', 'runinfo'))
        record_path = op.join(dset.path, record_dir, runinfo)
        if not op.lexists(record_path):
            raise ValueError("Run record sidecar file not found: {}".format(record_path))
        # TODO `get` the file
        recs = load_stream(record_path, compressed=True)
        # TODO check if there is a record
        runinfo = next(recs)
    if 'cmd' not in runinfo:
        raise ValueError("Looks like a run commit but does not have a command")
    return rec_msg.rstrip(), runinfo
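
A small self-contained demonstration of the commit-message layout this parser expects, using the same regular expression on a hand-written message; the command in the JSON payload is made up:

# Demonstration of the expected commit-message layout using the same regex;
# the command in the JSON payload is made up.
import json
import re

cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ' \
               r'===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'

message = (
    "[DATALAD RUNCMD] convert image\n\n"
    "=== Do not change lines below ===\n"
    '{"cmd": "convert in.png out.jpg", "exit": 0}\n'
    "^^^ Do not change lines above ^^^"
)

m = re.match(cmdrun_regex, message, re.MULTILINE | re.DOTALL)
rec_msg, runinfo = m.groups()
print(rec_msg.rstrip())              # -> convert image
print(json.loads(runinfo)["cmd"])    # -> convert in.png out.jpg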
Example No. 58
0
def get_cached_url_content(url, name=None, fetcher=None, maxage=None):
    """Loader of a document from a url, which caches loaded instance on disk

    Doesn't do anything smart about HTTP headers etc., which could tell
    caches/proxies how long to retain the content.

    TODO: theoretically it is not network specific at all -- just a memoize
    pattern -- but at some point we may make it treat headers etc. correctly.
    ATM it supports any URL we support via providers/downloaders.

    Parameters
    ----------
    url: str
       URL of the document to load
    name: str, optional
       Distinguishing name used when deriving the cache filename
    fetcher: callable, optional
       Function to call with url if needed to be refetched
    maxage: float, optional
       Age in days to retain valid for.  <0 - would retain forever.  If None -
       would consult the config, 0 - would force to reload
    """
    doc_fname = get_url_cache_filename(url, name)
    if maxage is None:
        maxage = float(cfg.get('datalad.locations.cache-maxage'))

    doc = None
    if os.path.exists(doc_fname) and maxage != 0:

        fage = (time.time() - os.stat(doc_fname).st_mtime)/(24. * 3600)
        if maxage < 0 or fage < maxage:
            try:
                lgr.debug("use cached request result to '%s' from %s", url, doc_fname)
                doc = pickle.load(open(doc_fname, 'rb'))
            except Exception as e:  # it is OK to ignore any error and fall back on the true source
                lgr.warning(
                    "cannot load cache from '%s', fall back to download: %s",
                    doc_fname, exc_str(e))

    if doc is None:
        if fetcher is None:
            from datalad.downloaders.providers import Providers
            providers = Providers.from_config_files()
            fetcher = providers.fetch

        doc = fetcher(url)
        assure_dir(dirname(doc_fname))
        # use pickle to store the entire request result dict
        pickle.dump(doc, open(doc_fname, 'wb'))
        lgr.debug("stored result of request to '{}' in {}".format(url, doc_fname))
    return doc
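
The caching scheme is an ordinary memoize-to-disk pattern: derive a cache filename from the URL, reuse the pickled result while it is younger than maxage days, refetch otherwise. A generic sketch of that pattern without the DataLad plumbing (cache directory and fetcher are placeholders):

# Generic memoize-to-disk sketch; cache directory and fetcher are placeholders.
import hashlib
import os
import pickle
import time

CACHE_DIR = "/tmp/url-cache"

def cached_fetch(url, fetcher, maxage_days=7):
    fname = os.path.join(CACHE_DIR, hashlib.md5(url.encode()).hexdigest())
    if os.path.exists(fname):
        age_days = (time.time() - os.stat(fname).st_mtime) / (24 * 3600)
        if maxage_days < 0 or age_days < maxage_days:
            try:
                with open(fname, "rb") as f:
                    return pickle.load(f)
            except Exception:
                pass                       # fall back to a fresh download
    doc = fetcher(url)
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(fname, "wb") as f:
        pickle.dump(doc, f)
    return doc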
Example No. 59
0
def get_native_metadata(ds, guess_type=False, ds_identifier=None):
    """Parse a dataset to gather its native metadata

    Returns
    -------
    List
        Each item in the list is a metadata dictionary (JSON-LD compliant).
        The first item corresponds to the annex-based metadata of the dataset.
        The last item contains the native metadata of the dataset content. Any
        additional items correspond to subdataset metadata sets.
    """
    if ds_identifier is None:
        ds_identifier = ds.id
    # using a list, because we could get multiple sets of metadata per
    # dataset, and we want to quickly collect them without having to do potentially
    # complex graph merges
    meta = []
    # get native metadata
    nativetypes = get_metadata_type(ds, guess=guess_type)
    if not nativetypes:
        return meta

    # keep local, who knows what some parsers might pull in
    from . import parsers
    for nativetype in nativetypes:
        if nativetype == 'aggregate':
            # this is special and needs to be ignored here, even if it was
            # configured. reason: this parser runs anyway in get_metadata()
            continue
        pmod = import_module('.{}'.format(nativetype),
                             package=parsers.__package__)
        try:
            native_meta = pmod.MetadataParser(ds).get_metadata(ds_identifier)
        except Exception as e:
            lgr.error('failed to get native metadata ({}): {}'.format(nativetype, exc_str(e)))
            continue
        if native_meta:
            # TODO here we could apply a "patch" to the native metadata, if desired

            # try hard to keep things a simple non-nested list
            if isinstance(native_meta, list):
                meta.extend(native_meta)
            else:
                meta.append(native_meta)

    return meta
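
The parser dispatch is a common plugin pattern: the configured type name is turned into a relative module name and imported at runtime. A tiny illustration of the same import_module call against a standard-library package, which merely stands in for the parsers package:

# The same relative-import pattern against a stdlib package, purely illustrative.
from importlib import import_module

name = "parser"                                   # stands in for a configured nativetype
mod = import_module(".{}".format(name), package="email")
print(mod.__name__)                               # -> email.parser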