Example #1
    def update_dataset_files(self, files, ref, delete=False):
        """Update files and dataset metadata according to their remotes.

        :param files: List of files to be updated
        :param delete: Indicates whether to delete files or not

        :return: List of files that should be deleted
        """
        from renku import LocalClient

        visited_repos = {}
        updated_files = []
        deleted_files = []

        for file_ in files:
            if not file_.based_on:
                continue

            file_.based_on = DatasetFile.from_jsonld(file_.based_on)
            based_on = file_.based_on
            url = based_on.url
            if url in visited_repos:
                repo, repo_path, remote_client = visited_repos[url]
            else:
                repo, repo_path = self.prepare_git_repo(url, ref)
                remote_client = LocalClient(repo_path)
                visited_repos[url] = repo, repo_path, remote_client

            remote_file = self._fetch_file_metadata(remote_client,
                                                    based_on.path)

            if not remote_file:
                try:
                    remote_file = DatasetFile.from_revision(
                        remote_client,
                        path=based_on.path,
                        url=url,
                        added=based_on.added)
                except KeyError:
                    raise errors.ParameterError(
                        'Cannot find file {} in the repo {}'.format(
                            based_on.path, url))

            commit_sha = self._get_commit_sha_from_label(based_on)
            remote_commit_sha = self._get_commit_sha_from_label(remote_file)
            if commit_sha != remote_commit_sha:
                src = Path(repo.working_dir) / based_on.path
                dst = self.renku_path.parent / file_.path

                if src.exists():
                    # Fetch file if it is tracked by Git LFS
                    self._fetch_lfs_files(repo_path, {based_on.path})
                    if remote_client._is_external_file(src):
                        self.remove_file(dst)
                        self._create_external_file(src.resolve(), dst)
                    else:
                        shutil.copy(src, dst)
                    file_.based_on.commit = remote_file.commit
                    file_.based_on._label = remote_file._label
                    updated_files.append(file_)
                else:
                    # File was removed or renamed
                    if delete:
                        self.remove_file(dst)
                    deleted_files.append(file_)

        if not updated_files and (not delete or not deleted_files):
            # Nothing to commit or update
            return deleted_files

        # Commit changes in files

        file_paths = {str(f.path) for f in updated_files + deleted_files}
        # Force-add to include possible ignored files that are in datasets
        self.repo.git.add(*file_paths, force=True)
        self.repo.index.commit(
            'renku dataset: updated {} files and deleted {} files'.format(
                len(updated_files), len(deleted_files)))

        # Update datasets' metadata

        modified_datasets = {}

        for file_ in updated_files:
            # Re-create list of creators
            creators = []
            # grab all the creators from the commit history
            for commit in repo.iter_commits(paths=file_.path):
                creator = Person.from_commit(commit)
                if creator not in creators:
                    creators.append(creator)

            new_file = DatasetFile.from_revision(self,
                                                 path=file_.path,
                                                 based_on=file_.based_on,
                                                 creator=creators)
            file_.dataset.update_files([new_file])
            modified_datasets[file_.dataset.name] = file_.dataset

        if delete:
            for file_ in deleted_files:
                file_.dataset.unlink_file(file_.path)
                modified_datasets[file_.dataset.name] = file_.dataset

        for dataset in modified_datasets.values():
            dataset.to_yaml()

        return deleted_files
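
The metadata update at the end of this example rebuilds each file's creator list by walking the file's commit history. That GitPython pattern, taken on its own, looks roughly like the sketch below; the repository and file paths are placeholders, and a plain (name, email) tuple stands in for renku's Person model.

from git import Repo

def unique_authors(repo_path, file_path):
    """Collect distinct commit authors for one file (most recent first)."""
    repo = Repo(repo_path)
    authors = []
    for commit in repo.iter_commits(paths=file_path):
        author = (commit.author.name, commit.author.email)
        if author not in authors:
            authors.append(author)
    return authors

# unique_authors('.', 'data/my-dataset/file.csv')  # placeholder paths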
Example #2
    def add_data_to_dataset(self,
                            dataset,
                            urls,
                            force=False,
                            sources=(),
                            destination='',
                            ref=None,
                            link=False,
                            external=False,
                            extract=False,
                            all_at_once=False,
                            destination_names=None,
                            progress=None):
        """Import the data into the data directory."""
        warning_message = ''
        dataset_path = self.path / self.datadir / dataset.short_name

        destination = destination or Path('.')
        destination = self._resolve_path(dataset_path, destination)
        destination = self.path / dataset_path / destination

        files = []
        if all_at_once:  # only for URLs
            files = self._add_from_urls(dataset=dataset,
                                        urls=urls,
                                        destination_names=destination_names,
                                        destination=destination,
                                        extract=extract,
                                        progress=progress)
        else:
            for url in urls:
                is_remote, is_git = _check_url(url)

                if is_git and is_remote:  # Remote git repo
                    sources = sources or ()
                    new_files = self._add_from_git(dataset, url, sources,
                                                   destination, ref)
                else:
                    if sources:
                        raise errors.UsageError(
                            'Cannot use "--source" with URLs or local files.')

                    if not is_remote:  # Local path, might be git
                        if is_git:
                            warning_message = 'Adding data from local Git ' \
                                'repository. Use remote\'s Git URL instead ' \
                                'to enable lineage information and updates.'
                        u = parse.urlparse(url)
                        new_files = self._add_from_local(
                            dataset, u.path, link, external, destination)
                    else:  # Remote URL
                        new_files = self._add_from_url(dataset,
                                                       url,
                                                       destination,
                                                       extract,
                                                       progress=progress)

                files.extend(new_files)

        files_to_commit = {f['path'] for f in files if f['path']}
        ignored = self.find_ignored_paths(*files_to_commit)

        if not force:
            if ignored:
                raise errors.IgnoredFiles(ignored)
            if dataset.contains_any(files):
                raise errors.DatasetFileExists()

        # all files at this point can be force-added and overwritten

        for data in files:
            operation = data.pop('operation', None)
            if not operation:
                continue

            src, dst, action = operation

            # Remove existing file if any
            self.remove_file(dst)
            dst.parent.mkdir(parents=True, exist_ok=True)

            if action == 'copy':
                shutil.copy(src, dst)
            elif action == 'link':
                try:
                    os.link(src, dst)
                except Exception as e:
                    raise errors.OperationError(
                        'Could not create hard link. Retry without "--link."'
                    ) from e
            elif action == 'symlink':
                self._create_external_file(src, dst)
                data['external'] = True

        # Track non-symlinks in LFS
        self.track_paths_in_storage(*files_to_commit)

        # Force-add to include possible ignored files
        self.repo.git.add(*files_to_commit, force=True)
        self.repo.git.add(self.renku_pointers_path, force=True)

        staged_files = self.repo.index.diff('HEAD')
        if staged_files:
            msg = 'renku dataset: committing {} newly added files'.format(
                len(files_to_commit))
            self.repo.index.commit(msg)

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            if os.path.basename(str(data['path'])) == '.git':
                continue

            dataset_file = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to root for submodules.
            if dataset_file.client != self:
                dataset_file.path = str(data['path'])
            dataset_files.append(dataset_file)

        dataset.update_files(dataset_files)
        return warning_message
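
The branching above hinges on a _check_url helper (not shown in these examples) that returns an (is_remote, is_git) pair for each URL. A rough, hypothetical sketch of such a classification is given below; the real renku helper may apply different rules, so treat this purely as an illustration of the contract.

import os
from urllib import parse

def check_url(url):
    """Rough (is_remote, is_git) classification; illustration only."""
    u = parse.urlparse(url)
    is_remote = u.scheme not in ('', 'file') or url.startswith('git@')
    if is_remote:
        is_git = url.endswith('.git') or url.startswith('git@') or u.scheme.startswith('git')
    else:
        # For a local path, look for a .git entry on disk
        local_path = u.path or url
        is_git = os.path.exists(os.path.join(local_path, '.git'))
    return is_remote, is_git

# check_url('https://github.com/SwissDataScienceCenter/renku-python.git')
# -> (True, True)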
Example #3
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self.prepare_git_repo(url, ref)
        files = set()
        used_sources = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources,
                                           destination, used_sources)

            if result:
                files.add(result)

        unused_sources = set(sources.keys()) - used_sources
        if unused_sources:
            unused_sources = {str(s) for s in unused_sources}
            raise errors.ParameterError('No such file or directory',
                                        param_hint=unused_sources)

        if destination.exists() and not destination.is_dir():
            if len(files) > 1:
                raise errors.ParameterError(
                    'Cannot copy multiple files or directories to a file')

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                try:
                    path = str(src.resolve().relative_to(repo_path))
                except ValueError:  # External file
                    pass
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        for path, src, dst in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    creators = based_on.creator
                else:
                    creators = []
                    # grab all the creators from the commit history
                    for commit in repo.iter_commits(paths=path):
                        creator = Person.from_commit(commit)
                        if creator not in creators:
                            creators.append(creator)

                    based_on = DatasetFile.from_revision(remote_client,
                                                         path=path,
                                                         url=url,
                                                         creator=creators)

                path_in_dst_repo = dst.relative_to(self.path)

                if remote_client._is_external_file(src):
                    operation = (src.resolve(), dst, 'symlink')
                else:
                    operation = (src, dst, 'copy')

                results.append({
                    'path': path_in_dst_repo,
                    'url': remove_credentials(url),
                    'creator': creators,
                    'parent': self,
                    'based_on': based_on,
                    'operation': operation
                })

        return results
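
Each result records its source through remove_credentials, which strips any embedded user credentials from the URL before it is stored in the dataset metadata. One way such a helper could be written with the standard library is sketched below; the actual renku implementation may differ.

from urllib import parse

def remove_credentials(url):
    """Drop user:password from a URL before persisting it; sketch only."""
    u = parse.urlparse(url)
    if u.username is None and u.password is None:
        return url
    netloc = u.hostname or ''
    if u.port:
        netloc += f':{u.port}'
    return parse.urlunparse(u._replace(netloc=netloc))

# remove_credentials('https://user:token@gitlab.com/group/project.git')
# -> 'https://gitlab.com/group/project.git'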
Example #4
def _migrate_submodule_based_datasets(client):
    from renku.core.management import LocalClient
    from renku.core.management.migrate import is_project_unsupported

    submodules = client.repo.submodules
    if not submodules:
        return

    for s in submodules:
        try:
            s.update()
        except GitError:
            pass

    submodules_urls = {s.path: s.url for s in submodules}

    repo_paths = []
    symlinks = []

    for dataset in client.datasets.values():
        for file_ in dataset.files:
            path = client.path / file_.path
            if not path.is_symlink():
                continue

            target = path.resolve()

            if '/.renku/vendors/' not in str(target):
                continue

            repo = Repo(target.parent, search_parent_directories=True)
            repo_path = repo.working_dir
            if repo_path not in repo_paths:
                repo_paths.append(repo_path)

            symlinks.append((file_.path, target, repo_path))

    if not symlinks:
        return

    remote_clients = {p: LocalClient(p) for p in repo_paths}

    for remote_client in remote_clients.values():
        if not is_project_unsupported(remote_client):
            migrate(remote_client)

    metadata = {}

    for path, target, repo_path in symlinks:
        remote_client = remote_clients[repo_path]
        path_within_repo = target.relative_to(repo_path)

        repo_is_remote = '.renku/vendors/local' not in repo_path
        based_on = None
        submodule_path = Path(repo_path).relative_to(client.path)

        url = submodules_urls.get(str(submodule_path), '')

        if repo_is_remote:
            based_on = _fetch_file_metadata(remote_client, path_within_repo)
            if based_on:
                based_on.url = url
                based_on.based_on = None
            else:
                based_on = DatasetFile.from_revision(remote_client,
                                                     path=path_within_repo,
                                                     url=url)
        else:
            if url:
                full_path = Path(url) / path_within_repo
                rel_path = os.path.relpath(full_path, client.path)
                url = f'file://{rel_path}'

        metadata[path] = (based_on, url)

        path = client.path / path
        path.unlink()

        try:
            shutil.move(target, path)
        except FileNotFoundError:
            raise errors.InvalidFileOperation(f'File was not found: {target}')

    for s in submodules:
        if s.path.startswith('.renku/vendors/'):
            try:
                s.remove(force=True)
            except ValueError:
                pass

    for dataset in client.datasets.values():
        for file_ in dataset.files:
            if file_.path in metadata:
                based_on, url = metadata[file_.path]
                file_.based_on = based_on
                file_.url = remove_credentials(url)

        dataset.to_yaml()
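
The migration finds vendored files by resolving each dataset symlink and asking GitPython for the repository that contains the target. In isolation, that lookup looks roughly like the sketch below; the file path is a placeholder.

from pathlib import Path
from git import Repo

def containing_repo(file_path):
    """Return the working dir of the repository holding a symlink's target."""
    path = Path(file_path)
    if not path.is_symlink():
        return None
    target = path.resolve()
    # search_parent_directories walks upwards until a .git directory is found
    repo = Repo(target.parent, search_parent_directories=True)
    return repo.working_dir

# containing_repo('data/my-dataset/file.csv')  # placeholder path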
Example #5
    def add_data_to_dataset(self,
                            dataset,
                            urls,
                            force=False,
                            overwrite=False,
                            sources=(),
                            destination='',
                            ref=None,
                            external=False,
                            extract=False,
                            all_at_once=False,
                            destination_names=None,
                            progress=None):
        """Import the data into the data directory."""
        messages = []
        warning_messages = []
        dataset_datadir = self.path / dataset.data_dir

        destination = destination or Path('.')
        destination = self._resolve_path(dataset_datadir, destination)
        destination = self.path / dataset_datadir / destination

        if destination.exists() and not destination.is_dir():
            raise errors.ParameterError(
                f'Destination is not a directory: "{destination}"')

        self.check_external_storage()

        files = []
        if all_at_once:  # Importing a dataset
            files = self._add_from_urls(dataset=dataset,
                                        urls=urls,
                                        destination_names=destination_names,
                                        destination=destination,
                                        extract=extract,
                                        progress=progress)
        else:
            for url in urls:
                is_remote, is_git = _check_url(url)

                if is_git and is_remote:  # Remote git repo
                    sources = sources or ()
                    new_files = self._add_from_git(dataset=dataset,
                                                   url=url,
                                                   sources=sources,
                                                   destination=destination,
                                                   ref=ref)
                else:
                    if sources:
                        raise errors.UsageError(
                            'Cannot use "--source" with URLs or local files.')

                    if not is_remote:  # Local path, might be git
                        if is_git:
                            warning_messages.append(
                                'Adding data from local Git repository: ' +
                                'Use remote\'s Git URL instead to enable ' +
                                'lineage information and updates.')
                        u = parse.urlparse(url)
                        new_files = self._add_from_local(
                            dataset=dataset,
                            path=u.path,
                            external=external,
                            destination=destination)
                    else:  # Remote URL
                        new_files = self._add_from_url(dataset=dataset,
                                                       url=url,
                                                       destination=destination,
                                                       extract=extract,
                                                       progress=progress)

                files.extend(new_files)

        # Remove all files that are under a .git directory
        paths_to_avoid = [
            f['path'] for f in files
            if '.git' in str(f['path']).split(os.path.sep)
        ]
        if paths_to_avoid:
            files = [f for f in files if f['path'] not in paths_to_avoid]
            warning_messages.append(
                'Ignored adding paths under a .git directory:\n  ' +
                '\n  '.join(str(p) for p in paths_to_avoid))

        files_to_commit = {str(self.path / f['path']) for f in files}

        if not force:
            ignored_files = self.find_ignored_paths(*files_to_commit)
            if ignored_files:
                ignored_files = set(ignored_files)
                files_to_commit = files_to_commit.difference(ignored_files)
                ignored_sources = []
                for file_ in files:
                    if str(self.path / file_['path']) in ignored_files:
                        operation = file_.get('operation')
                        if operation:
                            src, _, _ = operation
                            ignored_sources.append(src)
                        else:
                            ignored_sources.append(file_['path'])

                files = [
                    f for f in files
                    if str(self.path / f['path']) in files_to_commit
                ]
                warning_messages.append(
                    'These paths are ignored by one of your .gitignore ' +
                    'files (use "--force" flag if you really want to add ' +
                    'them):\n  ' +
                    '\n  '.join([str(p) for p in ignored_sources]))

        # all files at this point can be force-added

        if not overwrite:
            existing_files = dataset.find_files(files_to_commit)
            if existing_files:
                files_to_commit = files_to_commit.difference(existing_files)
                files = [
                    f for f in files
                    if str(self.path / f['path']) in files_to_commit
                ]
                warning_messages.append(
                    'These existing files were not overwritten ' +
                    '(use "--overwrite" flag to overwrite them):\n  ' +
                    '\n  '.join([str(p) for p in existing_files]))

        for data in files:
            operation = data.pop('operation', None)
            if not operation:
                continue

            src, dst, action = operation

            # Remove existing file if any
            self.remove_file(dst)
            dst.parent.mkdir(parents=True, exist_ok=True)

            if action == 'copy':
                shutil.copy(src, dst)
            elif action == 'move':
                shutil.move(src, dst, copy_function=shutil.copy)
            elif action == 'symlink':
                self._create_external_file(src, dst)
                data['external'] = True
            else:
                raise errors.OperationError(f'Invalid action {action}')

        # Track non-symlinks in LFS
        if self.check_external_storage():
            lfs_paths = self.track_paths_in_storage(*files_to_commit)
            show_message = self.get_value('renku', 'show_lfs_message')
            if (lfs_paths
                    and (show_message is None or show_message == 'True')):
                messages.append(
                    ('Adding these files to Git LFS:\n' +
                     '\t{}'.format('\n\t'.join(lfs_paths)) +
                     '\nTo disable this message in the future, run:' +
                     '\n\trenku config show_lfs_message False'))

        # Force-add to include possible ignored files
        self.repo.git.add(*files_to_commit, force=True)
        self.repo.git.add(self.renku_pointers_path, force=True)

        staged_files = self.repo.index.diff('HEAD')
        if staged_files:
            msg = 'renku dataset: committing {} newly added files'.format(
                len(files_to_commit))
            skip_hooks = not self.external_storage_requested
            self.repo.index.commit(msg, skip_hooks=skip_hooks)
        else:
            warning_messages.append('No file was added to project')

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            dataset_file = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to root for submodules.
            if dataset_file.client != self:
                dataset_file.path = str(data['path'])
            dataset_files.append(dataset_file)

        dataset.update_files(dataset_files)
        return warning_messages, messages
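
Every entry in files may carry an operation tuple (src, dst, action) that this method executes before committing. Stripped of the renku-specific external-file handling, the dispatch reduces to the standalone sketch below; the 'symlink' branch is simplified to os.symlink, whereas renku writes a pointer file via _create_external_file.

import os
import shutil
from pathlib import Path

def apply_operation(src, dst, action):
    """Copy, move, or symlink src to dst, creating parent directories."""
    dst = Path(dst)
    if dst.is_symlink() or dst.exists():
        dst.unlink()  # remove any existing file first
    dst.parent.mkdir(parents=True, exist_ok=True)
    if action == 'copy':
        shutil.copy(src, dst)
    elif action == 'move':
        shutil.move(str(src), str(dst), copy_function=shutil.copy)
    elif action == 'symlink':
        os.symlink(os.path.abspath(src), dst)  # simplified stand-in
    else:
        raise ValueError(f'Invalid action {action}')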
Example #6
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self._prepare_git_repo(url, ref)
        copied_sources = set()
        files = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources,
                                           destination)

            if result:
                files.add(result)
                source = result[3]
                copied_sources.add(source)

        uncopied_sources = sources - copied_sources
        if uncopied_sources:
            uncopied_sources = {str(s) for s in uncopied_sources}
            raise errors.ParameterError('No such file or directory',
                                        param_hint=uncopied_sources)

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _, __ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                path = str(src.resolve().relative_to(repo_path))
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        for path, src, dst, _ in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    creators = based_on.creator
                else:
                    creators = []
                    # grab all the creators from the commit history
                    for commit in repo.iter_commits(paths=path):
                        creator = Person.from_commit(commit)
                        if creator not in creators:
                            creators.append(creator)

                    based_on = DatasetFile.from_revision(remote_client,
                                                         path=path,
                                                         url=url)

                path_in_dst_repo = dst.relative_to(self.path)

                results.append({
                    'path': path_in_dst_repo,
                    'url': remove_credentials(url),
                    'creator': creators,
                    'parent': self,
                    'based_on': based_on
                })

                dst.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(str(src), str(dst))

        return results
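
Candidate files are discovered by traversing the tree of the remote repository's HEAD commit. On its own, that GitPython traversal looks like the sketch below; the repository path and prefix are placeholders.

from git import Repo

def files_under(repo_path, prefix):
    """List blob paths under a prefix in the HEAD commit of a repository."""
    repo = Repo(repo_path)
    matches = []
    for obj in repo.head.commit.tree.traverse():
        # traverse() yields both trees (directories) and blobs (files)
        if obj.type == 'blob' and obj.path.startswith(prefix):
            matches.append(obj.path)
    return matches

# files_under('.', 'data/')  # placeholder arguments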
Example #7
    def add_data_to_dataset(self,
                            dataset,
                            urls,
                            force=False,
                            sources=(),
                            destination='',
                            ref=None,
                            link=False,
                            extract=False,
                            all_at_once=False,
                            progress=None):
        """Import the data into the data directory."""
        warning_message = ''
        dataset_path = self.path / self.datadir / dataset.short_name

        destination = destination or Path('.')
        destination = self._resolve_path(dataset_path, destination)
        destination = self.path / dataset_path / destination

        files = []

        if all_at_once:  # only for URLs
            files = self._add_from_urls(dataset=dataset,
                                        urls=urls,
                                        destination=destination,
                                        extract=extract,
                                        progress=progress)
        else:
            for url in urls:
                is_remote, is_git = _check_url(url)

                if is_git and is_remote:  # Remote git repo
                    sources = sources or ()
                    new_files = self._add_from_git(dataset, url, sources,
                                                   destination, ref)
                else:
                    if sources:
                        raise errors.UsageError(
                            'Cannot use "--source" with URLs or local files.')

                    if not is_remote:  # Local path, might be git
                        if is_git:
                            warning_message = 'Adding data from local Git ' \
                                'repository. Use remote\'s Git URL instead ' \
                                'to enable lineage information and updates.'
                        u = parse.urlparse(url)
                        new_files = self._add_from_local(
                            dataset, u.path, link, destination)
                    else:  # Remote URL
                        new_files = self._add_from_url(dataset, url,
                                                       destination, extract)

                files.extend(new_files)

        self.track_paths_in_storage(*(f['path'] for f in files))

        ignored = self.find_ignored_paths(*(data['path']
                                            for data in files)) or []

        if ignored:
            if force:
                self.repo.git.add(*ignored, force=True)
            else:
                raise errors.IgnoredFiles(ignored)

        if dataset.contains_any(files) and force is False:
            raise errors.DatasetFileExists()

        # commit all new data
        file_paths = {str(data['path']) for data in files if str(data['path'])}
        files_to_add = (file_paths - set(ignored))

        self.repo.git.add(*files_to_add)

        if self.repo.is_dirty():
            commit_msg = ('renku dataset: '
                          'committing {} newly added files'
                          ).format(len(file_paths) + len(ignored))

            self.repo.index.commit(commit_msg)

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            if os.path.basename(str(data['path'])) == '.git':
                continue

            dataset_file = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to root for submodules.
            if dataset_file.client != self:
                dataset_file.path = str(data['path'])
            dataset_files.append(dataset_file)

        dataset.update_files(dataset_files)
        return warning_message
Example #8
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self.prepare_git_repo(url, ref)
        files = set()
        used_sources = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources, destination, used_sources)

            if result:
                files.add(result)

        unused_sources = set(sources.keys()) - used_sources
        if unused_sources:
            unused_sources = {str(s) for s in unused_sources}
            raise errors.ParameterError("No such file or directory", param_hint=unused_sources)

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                try:
                    path = str(src.resolve().relative_to(repo_path))
                except ValueError:  # External file
                    pass
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        new_files = []

        for path, src, dst in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    based_on.source = url
                else:
                    based_on = DatasetFile.from_revision(remote_client, path=src, url=url, source=url)

                path_in_dst_repo = dst.relative_to(self.path)

                if path_in_dst_repo in new_files:  # A path with the same destination is already copied
                    continue

                new_files.append(path_in_dst_repo)

                if remote_client._is_external_file(src):
                    operation = (src.resolve(), dst, "symlink")
                else:
                    operation = (src, dst, "copy")

                results.append(
                    {
                        "path": path_in_dst_repo,
                        "source": remove_credentials(url),
                        "parent": self,
                        "based_on": based_on,
                        "operation": operation,
                    }
                )

        return results
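
Before copying, all non-directory sources are pulled from Git LFS through _fetch_lfs_files. A plausible shape for such a helper, assuming the git lfs pull --include=<paths> command-line option, is sketched below; it is not the actual renku implementation.

import subprocess

def fetch_lfs_files(repo_path, paths):
    """Pull a specific set of paths from Git LFS; sketch only."""
    if not paths:
        return
    include = ','.join(str(p) for p in paths)
    subprocess.run(
        ['git', 'lfs', 'pull', f'--include={include}'],
        cwd=str(repo_path),
        check=False,  # the remote repository may not use LFS at all
    )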