Example #1
    def _add_from_url(self, dataset, url, destination, extract, progress=None):
        """Process an add from url and return the location on disk."""
        if destination.exists() and destination.is_dir():
            u = parse.urlparse(url)
            destination = destination / Path(u.path).name

        try:
            paths = _download(url=url,
                              download_to=destination,
                              extract=extract,
                              progress_class=progress)
        except error.HTTPError as e:  # pragma nocover
            raise errors.OperationError(
                'Cannot download from {}'.format(url)) from e

        # make the added file read-only
        for path in paths:
            mode = path.stat().st_mode & 0o777
            path.chmod(mode & ~(stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))

        return [{
            'path': path.relative_to(self.path),
            'url': remove_credentials(url),
            'creator': dataset.creator,
            'parent': self
        } for path in paths]
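The naming rule at the top of this example (when the destination is an existing directory, the last component of the URL path becomes the file name) can be tried on its own. A minimal standalone sketch with an illustrative URL; destination_for is a throwaway helper, not part of the renku API:

from pathlib import Path
from urllib import parse

def destination_for(url, destination):
    """Mimic the naming rule above: an existing directory gets the URL's basename appended."""
    destination = Path(destination)
    if destination.exists() and destination.is_dir():
        destination = destination / Path(parse.urlparse(url).path).name
    return destination

# With an existing ./data directory:
# destination_for('https://example.com/files/table.csv?token=abc', 'data') -> Path('data/table.csv')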
Example #2
    def _add_from_url(self, dataset, url, destination, extract, filename=None, progress=None):
        """Process adding from url and return the location on disk."""
        url = self._provider_check(url)

        try:
            start = time.time() * 1e3
            tmp_root, paths = self._download(url=url, filename=filename, extract=extract, progress_class=progress)

            exec_time = (time.time() * 1e3 - start) // 1e3
            # If execution time was less than or equal to zero seconds,
            # block the thread a bit to avoid being rate limited.
            if exec_time == 0:
                time.sleep(min(os.cpu_count() - 1, 4) or 1)

        except (requests.exceptions.HTTPError, error.HTTPError) as e:  # pragma nocover
            raise errors.OperationError("Cannot download from {}".format(url)) from e

        paths = [(src, destination / src.relative_to(tmp_root)) for src in paths if not src.is_dir()]
        return [
            {
                "operation": (src, dst, "move"),
                "path": dst.relative_to(self.path),
                "source": remove_credentials(url),
                "parent": self,
            }
            for src, dst in paths
        ]
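The rate-limit guard above sleeps when the download finished within the same second it started; how long it sleeps depends on the CPU count. A toy check of what min(os.cpu_count() - 1, 4) or 1 evaluates to for a few made-up CPU counts:

# Toy check of the sleep duration used above for different CPU counts.
for cpu_count in (1, 2, 8):
    seconds = min(cpu_count - 1, 4) or 1
    print(cpu_count, seconds)  # 1 -> 1, 2 -> 1, 8 -> 4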
Example #3
    def _add_from_url(self, dataset, url, destination, extract, progress=None):
        """Process adding from url and return the location on disk."""
        if destination.exists() and destination.is_dir():
            u = parse.urlparse(url)
            destination = destination / Path(u.path).name
        else:
            destination.parent.mkdir(parents=True, exist_ok=True)

        url = self.provider_check(url)

        try:
            start = time.time() * 1e+3
            paths = _download(url=url,
                              download_to=destination,
                              extract=extract,
                              progress_class=progress)

            exec_time = (time.time() * 1e+3 - start) // 1e+3
            # If execution time was less than or equal to zero seconds,
            # block the thread a bit to avoid being rate limited.
            if exec_time == 0:
                time.sleep(min(os.cpu_count() - 1, 4) or 1)

        except (requests.exceptions.HTTPError,
                error.HTTPError) as e:  # pragma nocover
            raise errors.OperationError(
                'Cannot download from {}'.format(url)) from e

        # make the added file read-only
        for path in paths:
            mode = path.stat().st_mode & 0o777
            path.chmod(mode & ~(stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))

        return [{
            'path': path.relative_to(self.path),
            'url': remove_credentials(url),
            'creator': dataset.creator,
            'parent': self
        } for path in paths]
Example #4
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self.prepare_git_repo(url, ref)
        files = set()
        used_sources = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources,
                                           destination, used_sources)

            if result:
                files.add(result)

        unused_sources = set(sources.keys()) - used_sources
        if unused_sources:
            unused_sources = {str(s) for s in unused_sources}
            raise errors.ParameterError('No such file or directory',
                                        param_hint=unused_sources)

        if destination.exists() and not destination.is_dir():
            if len(files) > 1:
                raise errors.ParameterError(
                    'Cannot copy multiple files or directories to a file')

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                try:
                    path = str(src.resolve().relative_to(repo_path))
                except ValueError:  # External file
                    pass
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        for path, src, dst in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    creators = based_on.creator
                else:
                    creators = []
                    # grab all the creators from the commit history
                    for commit in repo.iter_commits(paths=path):
                        creator = Person.from_commit(commit)
                        if creator not in creators:
                            creators.append(creator)

                    based_on = DatasetFile.from_revision(remote_client,
                                                         path=path,
                                                         url=url,
                                                         creator=creators)

                path_in_dst_repo = dst.relative_to(self.path)

                if remote_client._is_external_file(src):
                    operation = (src.resolve(), dst, 'symlink')
                else:
                    operation = (src, dst, 'copy')

                results.append({
                    'path': path_in_dst_repo,
                    'url': remove_credentials(url),
                    'creator': creators,
                    'parent': self,
                    'based_on': based_on,
                    'operation': operation
                })

        return results
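Unlike the earlier variants, this version does not copy files itself; each result carries an 'operation' tuple (src, dst, action) to be applied later ('copy' or 'symlink' here, 'move' in the URL-based variant above). A hypothetical consumer of those tuples could look like the following sketch; apply_operations is not part of the renku API:

import shutil

def apply_operations(results):
    """Apply the (src, dst, action) tuples returned above."""
    for result in results:
        src, dst, action = result['operation']
        dst.parent.mkdir(parents=True, exist_ok=True)
        if action == 'copy':
            shutil.copy(str(src), str(dst))
        elif action == 'move':
            shutil.move(str(src), str(dst))
        elif action == 'symlink':
            if dst.is_symlink() or dst.exists():
                dst.unlink()
            dst.symlink_to(src)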
Example #5
def _migrate_submodule_based_datasets(client):
    from renku.core.management import LocalClient
    from renku.core.management.migrate import is_project_unsupported

    submodules = client.repo.submodules
    if not submodules:
        return

    for s in submodules:
        try:
            s.update()
        except GitError:
            pass

    submodules_urls = {s.path: s.url for s in submodules}

    repo_paths = []
    symlinks = []

    for dataset in client.datasets.values():
        for file_ in dataset.files:
            path = client.path / file_.path
            if not path.is_symlink():
                continue

            target = path.resolve()

            if '/.renku/vendors/' not in str(target):
                continue

            repo = Repo(target.parent, search_parent_directories=True)
            repo_path = repo.working_dir
            if repo_path not in repo_paths:
                repo_paths.append(repo_path)

            symlinks.append((file_.path, target, repo_path))

    if not symlinks:
        return

    remote_clients = {p: LocalClient(p) for p in repo_paths}

    for remote_client in remote_clients.values():
        if not is_project_unsupported(remote_client):
            migrate(remote_client)

    metadata = {}

    for path, target, repo_path in symlinks:
        remote_client = remote_clients[repo_path]
        path_within_repo = target.relative_to(repo_path)

        repo_is_remote = '.renku/vendors/local' not in repo_path
        based_on = None
        submodule_path = Path(repo_path).relative_to(client.path)

        url = submodules_urls.get(str(submodule_path), '')

        if repo_is_remote:
            based_on = _fetch_file_metadata(remote_client, path_within_repo)
            if based_on:
                based_on.url = url
                based_on.based_on = None
            else:
                based_on = DatasetFile.from_revision(remote_client,
                                                     path=path_within_repo,
                                                     url=url)
        else:
            if url:
                full_path = Path(url) / path_within_repo
                rel_path = os.path.relpath(full_path, client.path)
                url = f'file://{rel_path}'

        metadata[path] = (based_on, url)

        path = client.path / path
        path.unlink()

        try:
            shutil.move(target, path)
        except FileNotFoundError:
            raise errors.InvalidFileOperation(f'File was not found: {target}')

    for s in submodules:
        if s.path.startswith('.renku/vendors/'):
            try:
                s.remove(force=True)
            except ValueError:
                pass

    for dataset in client.datasets.values():
        for file_ in dataset.files:
            if file_.path in metadata:
                based_on, url = metadata[file_.path]
                file_.based_on = based_on
                file_.url = remove_credentials(url)

        dataset.to_yaml()
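In the local-repository branch above, the new file URL is rebuilt from the submodule URL and the file's path inside it, relative to the project root. A standalone illustration with made-up paths:

import os
from pathlib import Path

# Made-up paths mirroring the local-repository branch above.
client_path = Path('/home/user/project')
url = '/home/user/other-project'            # submodule URL pointing at a local repository
path_within_repo = Path('data/sample/file.csv')

full_path = Path(url) / path_within_repo
rel_path = os.path.relpath(full_path, client_path)
print(f'file://{rel_path}')                 # file://../other-project/data/sample/file.csv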
Example #6
    def commit(self,
               commit_only=None,
               commit_empty=True,
               raise_if_empty=False,
               commit_message=None):
        """Automatic commit."""
        from git import Actor
        from renku.version import __version__, version_url

        diff_before = set()

        if commit_only == COMMIT_DIFF_STRATEGY:
            staged = {item.a_path for item in self.repo.index.diff(None)}

            modified = {item.a_path for item in self.repo.index.diff('HEAD')}

            if staged or modified:
                self.repo.git.reset()

            # Exclude files created by pipes.
            diff_before = {
                file_
                for file_ in self.repo.untracked_files
                if STARTED_AT - int(Path(file_).stat().st_ctime * 1e3) >= 1e3
            }

        if isinstance(commit_only, list):
            for path_ in commit_only:
                self.ensure_untracked(str(path_))
                self.ensure_unstaged(str(path_))

        yield

        committer = Actor('renku {0}'.format(__version__), version_url)

        change_types = {}

        if commit_only == COMMIT_DIFF_STRATEGY:
            # Get diff generated in command.
            change_types = {
                item.a_path: item.change_type
                for item in self.repo.index.diff(None)
            }
            staged_after = set(change_types.keys())

            modified_after_change_types = {
                item.a_path: item.change_type
                for item in self.repo.index.diff('HEAD')
            }

            modified_after = set(modified_after_change_types.keys())

            change_types.update(modified_after_change_types)

            diff_after = set(self.repo.untracked_files)\
                .union(staged_after)\
                .union(modified_after)

            # Remove files not touched in command.
            commit_only = list(diff_after - diff_before)

        if isinstance(commit_only, list):
            for path_ in commit_only:
                p = Path(path_)
                if p.exists() or change_types.get(path_) == 'D':
                    self.repo.git.add(path_)

        if not commit_only:
            self.repo.git.add('--all')

        if not commit_empty and not self.repo.index.diff('HEAD'):
            if raise_if_empty:
                raise errors.NothingToCommit()
            return

        if commit_message and not isinstance(commit_message, str):
            raise errors.CommitMessageEmpty()

        elif not commit_message:
            argv = [os.path.basename(sys.argv[0])
                    ] + [remove_credentials(arg) for arg in sys.argv[1:]]

            commit_message = ' '.join(argv)

        # Ignore pre-commit hooks since we have already done everything.
        self.repo.index.commit(
            commit_message,
            committer=committer,
            skip_hooks=True,
        )
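The COMMIT_DIFF_STRATEGY branch is plain set algebra: everything untracked, staged, or modified after the command, minus what was already untracked before it, is what ends up being committed. A toy illustration with made-up file names:

# Toy illustration of the diff bookkeeping above (file names are made up).
diff_before = {'pipe.out'}                     # untracked before the command ran
untracked_after = {'pipe.out', 'results.csv'}
staged_after = {'notebook.ipynb'}
modified_after = {'Dockerfile'}

diff_after = untracked_after | staged_after | modified_after
commit_only = sorted(diff_after - diff_before)
print(commit_only)  # ['Dockerfile', 'notebook.ipynb', 'results.csv']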
Example #7
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ('checksum', None),
                        ('filename', 'name'),
                        ('size_in_mb', 'size (mb)'),
                        ('filetype', 'type'),
                    )),
                    floatfmt='.2f'
                )
            )

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')
                ) + text_prompt

            click.confirm(text_prompt, abort=True)

            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb

            total_size *= 2**20

    except KeyError as e:
        raise ParameterError((
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        ))

    except LookupError as e:
        raise ParameterError(
            ('Could not process {0}.\n'
             'Reason: {1}'.format(uri, str(e)))
        )

    if not files:
        raise ParameterError('Dataset {} has no files.'.format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        if is_doi(dataset.identifier):
            dataset.same_as = Url(
                url_str=urllib.parse.
                urljoin('https://doi.org', dataset.identifier)
            )

        urls, names = zip(*[(f.url, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version)
            )
    else:
        short_name = short_name or dataset.short_name

        _add_to_dataset(
            client,
            urls=[record.project_url],
            short_name=short_name,
            sources=[f.path for f in files],
            with_metadata=dataset,
            create=True
        )
Example #8
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self._prepare_git_repo(url, ref)
        copied_sources = set()
        files = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources,
                                           destination)

            if result:
                files.add(result)
                source = result[3]
                copied_sources.add(source)

        uncopied_sources = sources - copied_sources
        if uncopied_sources:
            uncopied_sources = {str(s) for s in uncopied_sources}
            raise errors.ParameterError('No such file or directory',
                                        param_hint=uncopied_sources)

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _, __ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                path = str(src.resolve().relative_to(repo_path))
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        for path, src, dst, _ in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    creators = based_on.creator
                else:
                    creators = []
                    # grab all the creators from the commit history
                    for commit in repo.iter_commits(paths=path):
                        creator = Person.from_commit(commit)
                        if creator not in creators:
                            creators.append(creator)

                    based_on = DatasetFile.from_revision(remote_client,
                                                         path=path,
                                                         url=url)

                path_in_dst_repo = dst.relative_to(self.path)

                results.append({
                    'path': path_in_dst_repo,
                    'url': remove_credentials(url),
                    'creator': creators,
                    'parent': self,
                    'based_on': based_on
                })

                dst.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(str(src), str(dst))

        return results
Example #9
def import_dataset(
    client,
    uri,
    name="",
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    u = urllib.parse.urlparse(uri)
    if u.scheme not in ("", "file", "git+https", "git+ssh", "doi"):
        # NOTE: Check if the url is a redirect.
        uri = requests.head(uri, allow_redirects=True).url

    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError("Could not process {0}.\n{1}".format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ("checksum", None),
                        ("filename", "name"),
                        ("size_in_mb", "size (mb)"),
                        ("filetype", "type"),
                    )),
                    floatfmt=".2f",
                ))

            text_prompt = "Do you wish to download this version?"
            if record.is_last_version(uri) is False:
                text_prompt = (WARNING + "Newer version found at {}\n".format(
                    record.links.get("latest_html")) + text_prompt)

            click.confirm(text_prompt, abort=True)

            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb

            total_size *= 2**20

    except KeyError as e:
        raise ParameterError(
            ("Could not process {0}.\n"
             "Unable to fetch metadata due to {1}".format(uri, e)))

    except LookupError as e:
        raise ParameterError(("Could not process {0}.\n"
                              "Reason: {1}".format(uri, str(e))))

    if not files:
        raise ParameterError("Dataset {} has no files.".format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not name:
            name = generate_default_name(dataset.title, dataset.version)

        if is_doi(dataset.identifier):
            dataset.same_as = Url(url_str=urllib.parse.urljoin(
                "https://doi.org", dataset.identifier))

        urls, names = zip(*[(f.source, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            name=name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            tag_name = re.sub("[^a-zA-Z0-9.-_]", "_", dataset.version)
            tag_dataset(
                client, name, tag_name,
                "Tag {} created by renku import".format(dataset.version))
    else:
        name = name or dataset.name

        if not dataset.data_dir:
            raise OperationError(
                f"Data directory for dataset must be set: {dataset.name}")

        sources = [f"{dataset.data_dir}/**"]
        for file_ in dataset.files:
            try:
                Path(file_.path).relative_to(dataset.data_dir)
            except ValueError:  # Files that are not in dataset's data directory
                sources.append(file_.path)

        _add_to_dataset(
            client,
            urls=[record.project_url],
            name=name,
            sources=sources,
            with_metadata=dataset,
            create=True,
        )
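Compared with the earlier versions, this one first resolves HTTP redirects for URIs that are not file, git, or DOI scheme URIs, so the provider lookup sees the final URL. The same call can be tried in isolation; the DOI below is only a placeholder and the call needs network access:

import requests

# Follow redirects without downloading the body; the URL is only illustrative.
final_url = requests.head("https://doi.org/10.5281/zenodo.1234567", allow_redirects=True).url
print(final_url)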
Example #10
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri)
        dataset = record.as_dataset(client)
        files = dataset.files

        if with_prompt:
            click.echo(
                tabulate(files,
                         headers=OrderedDict((
                             ('checksum', None),
                             ('filename', 'name'),
                             ('size_in_mb', 'size (mb)'),
                             ('filetype', 'type'),
                         ))))

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')) + text_prompt

            click.confirm(text_prompt, abort=True)

    except KeyError as e:
        raise ParameterError(
            ('Could not process {0}.\n'
             'Unable to fetch metadata due to {1}'.format(uri, e)))

    except LookupError:
        raise ParameterError(('Could not process {0}.\n'
                              'URI not found.'.format(uri)))

    if files:
        if not short_name:
            short_name = generate_default_short_name(dataset.name,
                                                     dataset.version)

        dataset.url = remove_credentials(dataset.url)

        add_to_dataset(
            client,
            urls=[f.url for f in files],
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            progress=progress,
        )

        if dataset.version:
            tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version))
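The dataset version is turned into a git tag name by replacing disallowed characters with underscores. A quick check of the substitution with made-up version strings:

import re

# Quick check of the substitution above with made-up version strings.
for version in ('1.0 beta', '2.0+rc1'):
    print(re.sub('[^a-zA-Z0-9.-_]', '_', version))
# 1.0_beta
# 2.0_rc1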
Example #11
    def _add_from_git(self, dataset, url, sources, destination, ref):
        """Process adding resources from another git repository."""
        from renku import LocalClient

        u = parse.urlparse(url)

        sources = self._resolve_paths(u.path, sources)

        # Get all files from repo that match sources
        repo, repo_path = self.prepare_git_repo(url, ref)
        files = set()
        used_sources = set()
        for file in repo.head.commit.tree.traverse():
            path = file.path
            result = self._get_src_and_dst(path, repo_path, sources, destination, used_sources)

            if result:
                files.add(result)

        unused_sources = set(sources.keys()) - used_sources
        if unused_sources:
            unused_sources = {str(s) for s in unused_sources}
            raise errors.ParameterError("No such file or directory", param_hint=unused_sources)

        # Create metadata and move files to dataset
        results = []
        remote_client = LocalClient(repo_path)

        # Pull files from LFS
        paths = set()
        for path, src, _ in files:
            if src.is_dir():
                continue
            if src.is_symlink():
                try:
                    path = str(src.resolve().relative_to(repo_path))
                except ValueError:  # External file
                    pass
            paths.add(path)
        self._fetch_lfs_files(repo_path, paths)

        # Fetch metadata from Renku if any
        paths = {f[0] for f in files}
        metadata = self._fetch_files_metadata(remote_client, paths)

        new_files = []

        for path, src, dst in files:
            if not src.is_dir():
                # Use original metadata if it exists
                based_on = metadata.get(path)
                if based_on:
                    based_on.url = url
                    based_on.based_on = None
                    based_on.source = url
                else:
                    based_on = DatasetFile.from_revision(remote_client, path=src, url=url, source=url)

                path_in_dst_repo = dst.relative_to(self.path)

                if path_in_dst_repo in new_files:  # A path with the same destination is already copied
                    continue

                new_files.append(path_in_dst_repo)

                if remote_client._is_external_file(src):
                    operation = (src.resolve(), dst, "symlink")
                else:
                    operation = (src, dst, "copy")

                results.append(
                    {
                        "path": path_in_dst_repo,
                        "source": remove_credentials(url),
                        "parent": self,
                        "based_on": based_on,
                        "operation": operation,
                    }
                )

        return results