def find_explicit_inputs(self):
        """Yield explicit inputs and command line input bindings if any.

        Every user-declared input that was not already auto-detected is
        validated (must live inside the working directory and exist on
        disk) and yielded as a ``CommandInputParameter``.

        :raises errors.UsageError: If an explicit input is outside the
            repository or does not exist.
        """
        # Paths already detected as inputs; explicit duplicates are skipped.
        input_paths = [
            input.default.path for input in self.inputs
            if input.type in PATH_OBJECTS
        ]
        # Continue the id numbering after the already-assigned inputs
        # and arguments.
        input_id = len(self.inputs) + len(self.arguments)

        for explicit_input in self.explicit_inputs:
            if explicit_input in input_paths:
                continue

            # relative_to raises ValueError when the path is outside
            # the working directory (i.e. outside the repository).
            try:
                explicit_input.relative_to(self.working_dir)
            except ValueError:
                raise errors.UsageError(
                    "The input file or directory is not in the repository."
                    "\n\n\t" + click.style(str(explicit_input), fg="yellow") +
                    "\n\n")
            if self.is_existing_path(explicit_input) is None:
                raise errors.UsageError(
                    "The input file or directory does not exist."
                    "\n\n\t" + click.style(str(explicit_input), fg="yellow") +
                    "\n\n")
            input_id += 1
            default, type, _ = self.guess_type(explicit_input)
            # Explicit inputs are either File or Directory
            assert type in PATH_OBJECTS
            # The inputBinding is None because these inputs won't
            # appear on command-line
            yield CommandInputParameter(id="input_{0}".format(input_id),
                                        type=type,
                                        default=default,
                                        inputBinding=None)
Example #2
0
def config(key, value, remove, local_only, global_only):
    """Read, write or remove a configuration option."""
    writing = value is not None

    # Reject contradictory flag combinations before doing any work.
    if writing and remove:
        raise errors.UsageError('Cannot remove and set at the same time.')
    if remove and not key:
        raise errors.UsageError('KEY is missing.')
    if local_only and global_only:
        raise errors.UsageError('Cannot use --local and --global together.')

    if remove:
        update_config(key, remove=remove, global_only=global_only)
        return
    if writing:
        update_config(key, value=value, global_only=global_only)
        return

    # No value and no --remove: read the option and print it.
    click.secho(read_config(key, local_only, global_only))
Example #3
0
    def add_data_to_dataset(self,
                            dataset,
                            urls,
                            force=False,
                            sources=(),
                            destination='',
                            ref=None,
                            link=False,
                            external=False,
                            extract=False,
                            all_at_once=False,
                            destination_names=None,
                            progress=None):
        """Import the data into the data directory.

        :param dataset: Dataset to add the files to.
        :param urls: URLs or local paths of the data to add.
        :param force: Add files even if they are git-ignored or already
            present in the dataset.
        :param sources: Paths inside a remote Git repository to add (only
            valid together with a remote Git URL).
        :param destination: Target path inside the dataset data directory.
        :param ref: Git reference to use when adding from a Git repository.
        :param link: Hard-link local files instead of copying.
        :param external: Add local files as external (symlinked) files.
        :param extract: Extract downloaded archives.
        :param all_at_once: Treat all ``urls`` as plain URLs and fetch them
            in one batch.
        :param destination_names: File names for the downloaded URLs (used
            with ``all_at_once``).
        :param progress: Optional progress-reporting callback.
        :returns: A warning message string ('' when there is none).
        :raises errors.UsageError: When ``sources`` is used without a
            remote Git URL.
        :raises errors.IgnoredFiles: When files are git-ignored and
            ``force`` is not set.
        :raises errors.DatasetFileExists: When files already exist in the
            dataset and ``force`` is not set.
        """
        warning_message = ''
        dataset_path = self.path / self.datadir / dataset.short_name

        # Resolve the destination relative to the dataset's data directory.
        destination = destination or Path('.')
        destination = self._resolve_path(dataset_path, destination)
        destination = self.path / dataset_path / destination

        files = []
        if all_at_once:  # only for URLs
            files = self._add_from_urls(dataset=dataset,
                                        urls=urls,
                                        destination_names=destination_names,
                                        destination=destination,
                                        extract=extract,
                                        progress=progress)
        else:
            # Dispatch each URL to the right importer based on whether it
            # is remote and/or a Git repository.
            for url in urls:
                is_remote, is_git = _check_url(url)

                if is_git and is_remote:  # Remote git repo
                    sources = sources or ()
                    new_files = self._add_from_git(dataset, url, sources,
                                                   destination, ref)
                else:
                    if sources:
                        raise errors.UsageError(
                            'Cannot use "--source" with URLs or local files.')

                    if not is_remote:  # Local path, might be git
                        if is_git:
                            warning_message = 'Adding data from local Git ' \
                                'repository. Use remote\'s Git URL instead ' \
                                'to enable lineage information and updates.'
                        u = parse.urlparse(url)
                        new_files = self._add_from_local(
                            dataset, u.path, link, external, destination)
                    else:  # Remote URL
                        new_files = self._add_from_url(dataset,
                                                       url,
                                                       destination,
                                                       extract,
                                                       progress=progress)

                files.extend(new_files)

        files_to_commit = {f['path'] for f in files if f['path']}
        ignored = self.find_ignored_paths(*files_to_commit)

        # Without --force, refuse to add ignored or already-existing files.
        if not force:
            if ignored:
                raise errors.IgnoredFiles(ignored)
            if dataset.contains_any(files):
                raise errors.DatasetFileExists()

        # all files at this point can be force-added and overwritten

        # Perform the pending copy/link/symlink operation for each file.
        for data in files:
            operation = data.pop('operation', None)
            if not operation:
                continue

            src, dst, action = operation

            # Remove existing file if any
            self.remove_file(dst)
            dst.parent.mkdir(parents=True, exist_ok=True)

            if action == 'copy':
                shutil.copy(src, dst)
            elif action == 'link':
                try:
                    os.link(src, dst)
                except Exception as e:
                    raise errors.OperationError(
                        'Could not create hard link. Retry without "--link."'
                    ) from e
            elif action == 'symlink':
                self._create_external_file(src, dst)
                data['external'] = True

        # Track non-symlinks in LFS
        self.track_paths_in_storage(*files_to_commit)

        # Force-add to include possible ignored files
        self.repo.git.add(*files_to_commit, force=True)
        self.repo.git.add(self.renku_pointers_path, force=True)

        staged_files = self.repo.index.diff('HEAD')
        if staged_files:
            msg = 'renku dataset: committing {} newly added files'.format(
                len(files_to_commit))
            self.repo.index.commit(msg)

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            # Skip anything that resolves to a .git entry.
            if os.path.basename(str(data['path'])) == '.git':
                continue

            dataset_file = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to root for submodules.
            if dataset_file.client != self:
                dataset_file.path = str(data['path'])
            dataset_files.append(dataset_file)

        dataset.update_files(dataset_files)
        return warning_message
Example #4
0
 def validate_path(self, attribute, value):
     """Validate that the configured path exists on disk."""
     if value.exists():
         return
     raise errors.UsageError('Directory must exist.')
Example #5
0
 def validate_command_line(self, attribute, value):
     """Validate that the command line is non-empty."""
     if value:
         return
     raise errors.UsageError('Command line can not be empty.')
Example #6
0
def run(
    client,
    explicit_inputs,
    explicit_outputs,
    no_output,
    no_input_detection,
    no_output_detection,
    success_codes,
    isolation,
    command_line,
):
    """Tracking work on a specific problem.

    Executes ``command_line`` while capturing its inputs and outputs as a
    workflow tool and recording it in the client's workflow storage.

    :param client: Repository client for the current project.
    :param explicit_inputs: Paths the user declared as inputs.
    :param explicit_outputs: Paths the user declared as outputs.
    :param no_output: Allow the command to produce no detected output.
    :param no_input_detection: Disable automatic input detection.
    :param no_output_detection: Disable automatic output detection.
    :param success_codes: Return codes accepted as success (defaults to
        ``{0}`` when falsy).
    :param isolation: NOTE(review): not referenced in this body —
        presumably handled by the caller; confirm.
    :param command_line: The command and its arguments to execute.
    :raises errors.UsageError: When explicit inputs/outputs conflict with
        redirected std streams.
    :raises errors.InvalidSuccessCode: When the command's return code is
        not in ``success_codes``.
    """
    # Detect stdout/stderr redirections among the candidate output paths.
    paths = explicit_outputs if no_output_detection else client.candidate_paths
    mapped_std = get_mapped_std_streams(paths, streams=("stdout", "stderr"))

    # Detect stdin redirection among the candidate input paths.
    paths = explicit_inputs if no_input_detection else client.candidate_paths
    mapped_std_in = get_mapped_std_streams(paths, streams=("stdin", ))
    mapped_std.update(mapped_std_in)

    # An explicit input must not also be the target of stdout/stderr.
    invalid = get_mapped_std_streams(explicit_inputs,
                                     streams=("stdout", "stderr"))
    if invalid:
        raise errors.UsageError(
            "Explicit input file cannot be used as stdout/stderr:"
            "\n\t" + click.style("\n\t".join(invalid.values()), fg="yellow") +
            "\n")

    # Likewise, an explicit output must not be read as stdin.
    invalid = get_mapped_std_streams(explicit_outputs, streams=("stdin", ))
    if invalid:
        raise errors.UsageError(
            "Explicit output file cannot be used as stdin:"
            "\n\t" + click.style("\n\t".join(invalid.values()), fg="yellow") +
            "\n")

    system_stdout = None
    system_stderr = None

    # /dev/tty is a virtual device that points to the terminal
    # of the currently executed process
    try:
        with open("/dev/tty", "w"):
            tty_exists = True
    except OSError:
        tty_exists = False

    try:
        stdout_redirected = "stdout" in mapped_std
        stderr_redirected = "stderr" in mapped_std

        if tty_exists:
            # if renku was called with redirected stdout/stderr, undo the
            # redirection here so error messages can be printed normally
            if stdout_redirected:
                system_stdout = open("/dev/tty", "w")
                old_stdout = sys.stdout
                sys.stdout = system_stdout

            if stderr_redirected:
                system_stderr = open("/dev/tty", "w")
                old_stderr = sys.stderr
                sys.stderr = system_stderr

        working_dir = client.repo.working_dir
        # Build the tool description; mapped std streams become relative
        # stdout/stderr/stdin keyword arguments on the factory.
        factory = CommandLineToolFactory(
            command_line=command_line,
            explicit_inputs=explicit_inputs,
            explicit_outputs=explicit_outputs,
            directory=os.getcwd(),
            working_dir=working_dir,
            no_input_detection=no_input_detection,
            no_output_detection=no_output_detection,
            successCodes=success_codes,
            **{
                name: os.path.relpath(path, working_dir)
                for name, path in mapped_std.items()
            },
        )
        with client.with_workflow_storage() as wf:
            with factory.watch(client, no_output=no_output) as tool:
                # Don't compute paths if storage is disabled.
                if client.check_external_storage():
                    # Make sure all inputs are pulled from a storage.
                    paths_ = (path for _, path in tool.iter_input_files(
                        client.workflow_path))
                    client.pull_paths_from_storage(*paths_)

                if tty_exists:
                    # apply original output redirection
                    if stdout_redirected:
                        sys.stdout = old_stdout
                    if stderr_redirected:
                        sys.stderr = old_stderr

                # Run the command with its mapped std streams attached.
                return_code = call(
                    factory.command_line,
                    cwd=os.getcwd(),
                    **{key: getattr(sys, key)
                       for key in mapped_std.keys()},
                )

                sys.stdout.flush()
                sys.stderr.flush()

                if tty_exists:
                    # change back to /dev/tty redirection
                    if stdout_redirected:
                        sys.stdout = system_stdout
                    if stderr_redirected:
                        sys.stderr = system_stderr

                if return_code not in (success_codes or {0}):
                    raise errors.InvalidSuccessCode(
                        return_code, success_codes=success_codes)

                wf.add_step(run=tool)

        if factory.messages:
            click.echo(factory.messages)

        if factory.warnings:
            click.echo(factory.warnings)

    finally:
        # Restore the original streams and close the /dev/tty handles.
        if system_stdout:
            sys.stdout = old_stdout
            system_stdout.close()
        if system_stderr:
            sys.stderr = old_stderr
            system_stderr.close()
Example #7
0
    def add_data_to_dataset(self,
                            dataset,
                            urls,
                            force=False,
                            overwrite=False,
                            sources=(),
                            destination='',
                            ref=None,
                            external=False,
                            extract=False,
                            all_at_once=False,
                            destination_names=None,
                            progress=None):
        """Import the data into the data directory.

        :param dataset: Dataset to add the files to.
        :param urls: URLs or local paths of the data to add.
        :param force: Add files even if they are git-ignored.
        :param overwrite: Overwrite files that already exist in the dataset.
        :param sources: Paths inside a remote Git repository to add (only
            valid together with a remote Git URL).
        :param destination: Target path inside the dataset data directory.
        :param ref: Git reference to use when adding from a Git repository.
        :param external: Add local files as external (symlinked) files.
        :param extract: Extract downloaded archives.
        :param all_at_once: Treat all ``urls`` as plain URLs and fetch them
            in one batch (used when importing a dataset).
        :param destination_names: File names for the downloaded URLs (used
            with ``all_at_once``).
        :param progress: Optional progress-reporting callback.
        :returns: Tuple of (warning_messages, messages) lists.
        :raises errors.ParameterError: When the destination exists but is
            not a directory.
        :raises errors.UsageError: When ``sources`` is used without a
            remote Git URL.
        :raises errors.OperationError: On an unknown file operation action.
        """
        messages = []
        warning_messages = []
        dataset_datadir = self.path / dataset.data_dir

        # Resolve the destination relative to the dataset's data directory.
        destination = destination or Path('.')
        destination = self._resolve_path(dataset_datadir, destination)
        destination = self.path / dataset_datadir / destination

        if destination.exists() and not destination.is_dir():
            raise errors.ParameterError(
                f'Destination is not a directory: "{destination}"')

        self.check_external_storage()

        files = []
        if all_at_once:  # Importing a dataset
            files = self._add_from_urls(dataset=dataset,
                                        urls=urls,
                                        destination_names=destination_names,
                                        destination=destination,
                                        extract=extract,
                                        progress=progress)
        else:
            # Dispatch each URL to the right importer based on whether it
            # is remote and/or a Git repository.
            for url in urls:
                is_remote, is_git = _check_url(url)

                if is_git and is_remote:  # Remote git repo
                    sources = sources or ()
                    new_files = self._add_from_git(dataset=dataset,
                                                   url=url,
                                                   sources=sources,
                                                   destination=destination,
                                                   ref=ref)
                else:
                    if sources:
                        raise errors.UsageError(
                            'Cannot use "--source" with URLs or local files.')

                    if not is_remote:  # Local path, might be git
                        if is_git:
                            warning_messages.append(
                                'Adding data from local Git repository: ' +
                                'Use remote\'s Git URL instead to enable ' +
                                'lineage information and updates.')
                        u = parse.urlparse(url)
                        new_files = self._add_from_local(
                            dataset=dataset,
                            path=u.path,
                            external=external,
                            destination=destination)
                    else:  # Remote URL
                        new_files = self._add_from_url(dataset=dataset,
                                                       url=url,
                                                       destination=destination,
                                                       extract=extract,
                                                       progress=progress)

                files.extend(new_files)

        # Remove all files that are under a .git directory
        paths_to_avoid = [
            f['path'] for f in files
            if '.git' in str(f['path']).split(os.path.sep)
        ]
        if paths_to_avoid:
            files = [f for f in files if f['path'] not in paths_to_avoid]
            warning_messages.append(
                'Ignored adding paths under a .git directory:\n  ' +
                '\n  '.join(str(p) for p in paths_to_avoid))

        files_to_commit = {str(self.path / f['path']) for f in files}

        if not force:
            # Drop git-ignored files and warn about them (keep the
            # original source path in the warning when available).
            ignored_files = self.find_ignored_paths(*files_to_commit)
            if ignored_files:
                ignored_files = set(ignored_files)
                files_to_commit = files_to_commit.difference(ignored_files)
                ignored_sources = []
                for file_ in files:
                    if str(self.path / file_['path']) in ignored_files:
                        operation = file_.get('operation')
                        if operation:
                            src, _, _ = operation
                            ignored_sources.append(src)
                        else:
                            ignored_sources.append(file_['path'])

                files = [
                    f for f in files
                    if str(self.path / f['path']) in files_to_commit
                ]
                # Fixed typo: 'Theses' -> 'These'.
                warning_messages.append(
                    'These paths are ignored by one of your .gitignore ' +
                    'files (use "--force" flag if you really want to add ' +
                    'them):\n  ' +
                    '\n  '.join([str(p) for p in ignored_sources]))

        # all files at this point can be force-added

        if not overwrite:
            # Skip files already present in the dataset and warn.
            existing_files = dataset.find_files(files_to_commit)
            if existing_files:
                files_to_commit = files_to_commit.difference(existing_files)
                files = [
                    f for f in files
                    if str(self.path / f['path']) in files_to_commit
                ]
                warning_messages.append(
                    'These existing files were not overwritten ' +
                    '(use "--overwrite" flag to overwrite them):\n  ' +
                    '\n  '.join([str(p) for p in existing_files]))

        # Perform the pending copy/move/symlink operation for each file.
        for data in files:
            operation = data.pop('operation', None)
            if not operation:
                continue

            src, dst, action = operation

            # Remove existing file if any
            self.remove_file(dst)
            dst.parent.mkdir(parents=True, exist_ok=True)

            if action == 'copy':
                shutil.copy(src, dst)
            elif action == 'move':
                shutil.move(src, dst, copy_function=shutil.copy)
            elif action == 'symlink':
                self._create_external_file(src, dst)
                data['external'] = True
            else:
                raise errors.OperationError(f'Invalid action {action}')

        # Track non-symlinks in LFS
        if self.check_external_storage():
            lfs_paths = self.track_paths_in_storage(*files_to_commit)
            show_message = self.get_value('renku', 'show_lfs_message')
            if (lfs_paths
                    and (show_message is None or show_message == 'True')):
                messages.append(
                    ('Adding these files to Git LFS:\n' +
                     '\t{}'.format('\n\t'.join(lfs_paths)) +
                     '\nTo disable this message in the future, run:' +
                     '\n\trenku config show_lfs_message False'))

        # Force-add to include possible ignored files
        self.repo.git.add(*files_to_commit, force=True)
        self.repo.git.add(self.renku_pointers_path, force=True)

        staged_files = self.repo.index.diff('HEAD')
        if staged_files:
            msg = 'renku dataset: committing {} newly added files'.format(
                len(files_to_commit))
            skip_hooks = not self.external_storage_requested
            self.repo.index.commit(msg, skip_hooks=skip_hooks)
        else:
            warning_messages.append('No file was added to project')

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            dataset_file = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to root for submodules.
            if dataset_file.client != self:
                dataset_file.path = str(data['path'])
            dataset_files.append(dataset_file)

        dataset.update_files(dataset_files)
        return warning_messages, messages
Example #8
0
    def add_data_to_dataset(self,
                            dataset,
                            urls,
                            force=False,
                            sources=(),
                            destination='',
                            ref=None,
                            link=False,
                            extract=False,
                            all_at_once=False,
                            progress=None):
        """Import the data into the data directory.

        :param dataset: Dataset to add the files to.
        :param urls: URLs or local paths of the data to add.
        :param force: Add files even if they are git-ignored or already
            present in the dataset.
        :param sources: Paths inside a remote Git repository to add (only
            valid together with a remote Git URL).
        :param destination: Target path inside the dataset data directory.
        :param ref: Git reference to use when adding from a Git repository.
        :param link: Hard-link local files instead of copying.
        :param extract: Extract downloaded archives.
        :param all_at_once: Treat all ``urls`` as plain URLs and fetch them
            in one batch.
        :param progress: Optional progress-reporting callback.
        :returns: A warning message string ('' when there is none).
        :raises errors.UsageError: When ``sources`` is used without a
            remote Git URL.
        :raises errors.IgnoredFiles: When files are git-ignored and
            ``force`` is not set.
        :raises errors.DatasetFileExists: When files already exist in the
            dataset and ``force`` is not set.
        """
        warning_message = ''
        dataset_path = self.path / self.datadir / dataset.short_name

        # Resolve the destination relative to the dataset's data directory.
        destination = destination or Path('.')
        destination = self._resolve_path(dataset_path, destination)
        destination = self.path / dataset_path / destination

        files = []

        if all_at_once:  # only for URLs
            files = self._add_from_urls(dataset=dataset,
                                        urls=urls,
                                        destination=destination,
                                        extract=extract,
                                        progress=progress)
        else:
            # Dispatch each URL to the right importer based on whether it
            # is remote and/or a Git repository.
            for url in urls:
                is_remote, is_git = _check_url(url)

                if is_git and is_remote:  # Remote git repo
                    sources = sources or ()
                    new_files = self._add_from_git(dataset, url, sources,
                                                   destination, ref)
                else:
                    if sources:
                        raise errors.UsageError(
                            'Cannot use "--source" with URLs or local files.')

                    if not is_remote:  # Local path, might be git
                        if is_git:
                            warning_message = 'Adding data from local Git ' \
                                'repository. Use remote\'s Git URL instead ' \
                                'to enable lineage information and updates.'
                        u = parse.urlparse(url)
                        new_files = self._add_from_local(
                            dataset, u.path, link, destination)
                    else:  # Remote URL
                        new_files = self._add_from_url(dataset, url,
                                                       destination, extract)

                files.extend(new_files)

        # Track the added paths in external storage (e.g. LFS).
        self.track_paths_in_storage(*(f['path'] for f in files))

        ignored = self.find_ignored_paths(*(data['path']
                                            for data in files)) or []

        # Ignored files are only added when --force is given.
        if ignored:
            if force:
                self.repo.git.add(*ignored, force=True)
            else:
                raise errors.IgnoredFiles(ignored)

        if dataset.contains_any(files) and force is False:
            raise errors.DatasetFileExists()

        # commit all new data
        file_paths = {str(data['path']) for data in files if str(data['path'])}
        files_to_add = (file_paths - set(ignored))

        self.repo.git.add(*files_to_add)

        if self.repo.is_dirty():
            commit_msg = ('renku dataset: '
                          'committing {} newly added files'
                          ).format(len(file_paths) + len(ignored))

            self.repo.index.commit(commit_msg)

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            # Skip anything that resolves to a .git entry.
            if os.path.basename(str(data['path'])) == '.git':
                continue

            dataset_file = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to root for submodules.
            if dataset_file.client != self:
                dataset_file.path = str(data['path'])
            dataset_files.append(dataset_file)

        dataset.update_files(dataset_files)
        return warning_message
    def guess_outputs(self, candidates):
        """Yield detected output and changed command input parameter.

        For each candidate path, yields a tuple of
        ``(CommandOutputParameter, new_input_or_None, glob)``. When an
        output corresponds to an existing input, the input is re-typed to
        a plain string so the output can reference it via a glob.

        :param candidates: Iterable of candidate output paths (relative
            to the working directory).
        :raises errors.InvalidOutputPath: When an output directory
            contains pre-existing files.
        :raises errors.UsageError: When a candidate path does not exist.
        """
        # TODO what to do with duplicate paths & inputs with same defaults
        candidates = list(candidates)
        tree = DirectoryTree.from_list(candidates)

        # Inputs whose value must change if they turn out to be outputs.
        input_candidates = {}
        # Paths that cannot be outputs because they are path-typed inputs.
        conflicting_paths = {}

        for index, input in enumerate(self.inputs):
            # Convert input defaults to paths relative to working directory.
            if input.type not in PATH_OBJECTS:
                if self.no_input_detection:
                    continue
                try:
                    path = self.directory / str(input.default)
                    input_path = Path(os.path.abspath(path)).relative_to(
                        self.working_dir)
                except FileNotFoundError:
                    continue
            else:
                input_path = input.default.path.relative_to(self.working_dir)

            if input_path.is_dir() and tree.get(input_path):
                # The directory might exist before running the script
                subpaths = {
                    str(input_path / path)
                    for path in tree.get(input_path, default=[])
                }
                absolute_path = os.path.abspath(input_path)
                if Path(absolute_path) not in self.explicit_outputs:
                    # Files in the directory that are not new candidates
                    # must not exist; otherwise the output dir is dirty.
                    content = {
                        str(path)
                        for path in input_path.rglob("*")
                        if not path.is_dir() and path.name != ".gitkeep"
                    }
                    preexisting_paths = content - subpaths
                    if preexisting_paths:
                        raise errors.InvalidOutputPath(
                            'The output directory "{0}" is not empty. \n\n'
                            "Delete existing files before running the "
                            "command:"
                            '\n  (use "git rm <file>..." to remove them '
                            "first)"
                            "\n\n".format(input_path) +
                            "\n".join("\t" + click.style(path, fg="yellow")
                                      for path in preexisting_paths) + "\n\n"
                            "Once you have removed files that should be used "
                            "as outputs,\n"
                            "you can safely rerun the previous command.")

                # Remove files from the input directory
                candidates[:] = (path for path in candidates
                                 if path not in subpaths)
                # Include input path in the candidates to check
                candidates.append(str(input_path))

                input_candidates[str(input_path)] = input
            elif input.type not in PATH_OBJECTS:
                # Input need to be changed if an output is detected
                input_candidates[str(input_path)] = input
            else:
                # Names that can not be outputs because they are already inputs
                conflicting_paths[str(input_path)] = input

        # stdout/stderr targets are never yielded as outputs here.
        streams = {
            path
            for path in (getattr(self, name) for name in ("stdout", "stderr"))
            if path is not None
        }

        # TODO group by a common prefix

        for position, path in enumerate(candidates):
            candidate = self.is_existing_path(self.working_dir / path)

            if candidate is None:
                raise errors.UsageError(
                    'Path "{0}" does not exist.'.format(path))

            glob = str(candidate.relative_to(self.working_dir))

            if glob in streams:
                continue

            new_input = None

            if glob in conflicting_paths:
                # it means that it is rewriting a file
                input = conflicting_paths[glob]
                new_input = attr.evolve(input, type="string", default=glob)
                input_candidates[glob] = new_input

                del conflicting_paths[glob]
                # TODO add warning ('Output already exists in inputs.')

            candidate_type = "Directory" if candidate.is_dir() else "File"

            if glob in input_candidates:
                input = input_candidates[glob]

                if new_input is None:
                    new_input = input_candidates[glob] = attr.evolve(
                        input, type="string", default=glob)

                # Output references the (re-typed) input via its id.
                yield (
                    CommandOutputParameter(
                        id="output_{0}".format(position),
                        type=candidate_type,
                        outputBinding=dict(glob="$(inputs.{0})".format(
                            input.id), ),
                    ),
                    new_input,
                    glob,
                )
            else:
                # Plain output with a literal glob; no input change needed.
                yield (
                    CommandOutputParameter(
                        id="output_{0}".format(position),
                        type=candidate_type,
                        outputBinding=dict(glob=glob, ),
                    ),
                    None,
                    glob,
                )