Example #1
0
    def _query_knowledge_graph(url):
        """Return the JSON payload fetched from the knowledge graph at *url*.

        :param url: Knowledge-graph endpoint to query.
        :return: Decoded JSON response body.
        :raises errors.OperationError: If the request fails or the server
            responds with a non-200 status code.
        """
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException as e:
            # BUG FIX: ``requests.get`` raises ``requests.exceptions.RequestException``
            # subclasses, never ``urllib.error.HTTPError`` — the previous handler
            # could not catch connection/timeout failures.
            raise errors.OperationError("Cannot access knowledge graph: {}".format(url)) from e
        if response.status_code != 200:
            raise errors.OperationError(
                "Cannot access knowledge graph: {}\nResponse code: {}".format(url, response.status_code)
            )

        return response.json()
Example #2
0
    def _add_from_url(self, dataset, url, destination, extract, progress=None):
        """Download *url* into the dataset and return metadata for each file.

        When *destination* is an existing directory, the file is placed
        inside it under the URL path's basename.
        """
        if destination.exists() and destination.is_dir():
            parsed = parse.urlparse(url)
            destination = destination / Path(parsed.path).name

        try:
            downloaded = _download(
                url=url,
                download_to=destination,
                extract=extract,
                progress_class=progress,
            )
        except error.HTTPError as e:  # pragma nocover
            raise errors.OperationError(
                'Cannot download from {}'.format(url)) from e

        # Strip all execute bits so the added files end up read-only.
        exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
        for added in downloaded:
            current_mode = added.stat().st_mode & 0o777
            added.chmod(current_mode & ~exec_bits)

        metadata = []
        for added in downloaded:
            metadata.append({
                'path': added.relative_to(self.path),
                'url': remove_credentials(url),
                'creator': dataset.creator,
                'parent': self
            })
        return metadata
Example #3
0
    def _add_from_url(self, dataset, url, destination, extract, filename=None, progress=None):
        """Download *url* and return ``move`` operations for the fetched files."""
        url = self._provider_check(url)

        try:
            started_at = time.time() * 1e3
            tmp_root, downloaded = self._download(url=url, filename=filename, extract=extract, progress_class=progress)

            elapsed_seconds = (time.time() * 1e3 - started_at) // 1e3
            # Sub-second downloads get briefly throttled so that rapid
            # successive requests do not trip the provider's rate limit.
            if elapsed_seconds == 0:
                time.sleep(min(os.cpu_count() - 1, 4) or 1)

        except (requests.exceptions.HTTPError, error.HTTPError) as e:  # pragma nocover
            raise errors.OperationError("Cannot download from {}".format(url)) from e

        operations = []
        for src in downloaded:
            if src.is_dir():
                continue
            dst = destination / src.relative_to(tmp_root)
            operations.append(
                {
                    "operation": (src, dst, "move"),
                    "path": dst.relative_to(self.path),
                    "source": remove_credentials(url),
                    "parent": self,
                }
            )
        return operations
Example #4
0
 def _create_external_file(self, src, dst):
     """Create a new external file as a relative symlink to a pointer file."""
     try:
         pointer = self._create_pointer_file(target=src)
         # Link via a path relative to dst's directory so the repo stays
         # relocatable.
         os.symlink(os.path.relpath(pointer, dst.parent), dst)
     except OSError as e:
         raise errors.OperationError("Could not create symbolic link") from e
Example #5
0
    def _add_from_url(self, dataset, url, destination, extract, progress=None):
        """Download *url* into the dataset and return metadata for each file."""
        if destination.exists() and destination.is_dir():
            parsed = parse.urlparse(url)
            destination = destination / Path(parsed.path).name
        else:
            destination.parent.mkdir(parents=True, exist_ok=True)

        url = self.provider_check(url)

        try:
            started_at = time.time() * 1e+3
            downloaded = _download(url=url,
                                   download_to=destination,
                                   extract=extract,
                                   progress_class=progress)

            elapsed_seconds = (time.time() * 1e+3 - started_at) // 1e+3
            # Sub-second downloads get briefly throttled so that rapid
            # successive requests do not trip the provider's rate limit.
            if elapsed_seconds == 0:
                time.sleep(min(os.cpu_count() - 1, 4) or 1)

        except (requests.exceptions.HTTPError,
                error.HTTPError) as e:  # pragma nocover
            raise errors.OperationError(
                'Cannot download from {}'.format(url)) from e

        # Strip all execute bits so the added files end up read-only, and
        # collect their metadata in the same pass.
        exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
        results = []
        for added in downloaded:
            current_mode = added.stat().st_mode & 0o777
            added.chmod(current_mode & ~exec_bits)
            results.append({
                'path': added.relative_to(self.path),
                'url': remove_credentials(url),
                'creator': dataset.creator,
                'parent': self
            })
        return results
Example #6
0
    def add_data_to_dataset(self,
                            dataset,
                            urls,
                            force=False,
                            sources=(),
                            destination='',
                            ref=None,
                            link=False,
                            external=False,
                            extract=False,
                            all_at_once=False,
                            destination_names=None,
                            progress=None):
        """Import the data into the data directory.

        Dispatches each URL to the matching ``_add_from_*`` helper (git,
        local path, or remote URL), applies any deferred file operations
        (copy/link/symlink), tracks the results in LFS, commits them, and
        records them as ``DatasetFile`` entries on *dataset*.

        :param dataset: Dataset that receives the files.
        :param urls: URLs or local paths to add.
        :param force: Add files even if they are git-ignored.
        :param sources: Paths within a remote git repository to add.
        :param destination: Target directory inside the dataset's data dir.
        :param ref: Git reference used for remote repositories.
        :param link: Hard-link local files instead of copying.
        :param external: Create external (symlinked) files.
        :param extract: Extract downloaded archives.
        :param all_at_once: Download all *urls* as one batch (URL imports).
        :param destination_names: Target names used with ``all_at_once``.
        :param progress: Progress-reporting class passed to downloads.
        :return: A warning message string ('' when there is none).
        :raises errors.IgnoredFiles: If files are git-ignored and not *force*.
        :raises errors.DatasetFileExists: If files already exist and not *force*.
        """
        warning_message = ''
        dataset_path = self.path / self.datadir / dataset.short_name

        # Resolve the destination relative to the dataset's data directory.
        destination = destination or Path('.')
        destination = self._resolve_path(dataset_path, destination)
        destination = self.path / dataset_path / destination

        files = []
        if all_at_once:  # only for URLs
            files = self._add_from_urls(dataset=dataset,
                                        urls=urls,
                                        destination_names=destination_names,
                                        destination=destination,
                                        extract=extract,
                                        progress=progress)
        else:
            for url in urls:
                is_remote, is_git = _check_url(url)

                if is_git and is_remote:  # Remote git repo
                    sources = sources or ()
                    new_files = self._add_from_git(dataset, url, sources,
                                                   destination, ref)
                else:
                    # "--source" only makes sense for git repositories.
                    if sources:
                        raise errors.UsageError(
                            'Cannot use "--source" with URLs or local files.')

                    if not is_remote:  # Local path, might be git
                        if is_git:
                            warning_message = 'Adding data from local Git ' \
                                'repository. Use remote\'s Git URL instead ' \
                                'to enable lineage information and updates.'
                        u = parse.urlparse(url)
                        new_files = self._add_from_local(
                            dataset, u.path, link, external, destination)
                    else:  # Remote URL
                        new_files = self._add_from_url(dataset,
                                                       url,
                                                       destination,
                                                       extract,
                                                       progress=progress)

                files.extend(new_files)

        files_to_commit = {f['path'] for f in files if f['path']}
        ignored = self.find_ignored_paths(*files_to_commit)

        # Without --force, refuse to proceed if anything is git-ignored or
        # already part of the dataset.
        if not force:
            if ignored:
                raise errors.IgnoredFiles(ignored)
            if dataset.contains_any(files):
                raise errors.DatasetFileExists()

        # all files at this point can be force-added and overwritten

        # Apply the deferred (src, dst, action) operations the helpers queued.
        for data in files:
            operation = data.pop('operation', None)
            if not operation:
                continue

            src, dst, action = operation

            # Remove existing file if any
            self.remove_file(dst)
            dst.parent.mkdir(parents=True, exist_ok=True)

            if action == 'copy':
                shutil.copy(src, dst)
            elif action == 'link':
                try:
                    os.link(src, dst)
                except Exception as e:
                    # Hard links fail across filesystems/devices.
                    raise errors.OperationError(
                        'Could not create hard link. Retry without "--link."'
                    ) from e
            elif action == 'symlink':
                self._create_external_file(src, dst)
                data['external'] = True

        # Track non-symlinks in LFS
        self.track_paths_in_storage(*files_to_commit)

        # Force-add to include possible ignored files
        self.repo.git.add(*files_to_commit, force=True)
        self.repo.git.add(self.renku_pointers_path, force=True)

        # Only commit when something was actually staged.
        staged_files = self.repo.index.diff('HEAD')
        if staged_files:
            msg = 'renku dataset: committing {} newly added files'.format(
                len(files_to_commit))
            self.repo.index.commit(msg)

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            # '.git' entries cannot live inside a git repository.
            if os.path.basename(str(data['path'])) == '.git':
                continue

            dataset_file = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to root for submodules.
            if dataset_file.client != self:
                dataset_file.path = str(data['path'])
            dataset_files.append(dataset_file)

        dataset.update_files(dataset_files)
        return warning_message
Example #7
0
    def add_data_to_dataset(self,
                            dataset,
                            urls,
                            force=False,
                            overwrite=False,
                            sources=(),
                            destination='',
                            ref=None,
                            external=False,
                            extract=False,
                            all_at_once=False,
                            destination_names=None,
                            progress=None):
        """Import the data into the data directory.

        Dispatches each URL to the matching ``_add_from_*`` helper (git,
        local path, or remote URL), filters out ignored/existing files,
        applies the deferred file operations (copy/move/symlink), tracks
        the results in LFS, commits them, and records them as
        ``DatasetFile`` entries on *dataset*.

        :param dataset: Dataset that receives the files.
        :param urls: URLs or local paths to add.
        :param force: Add files even if they are git-ignored.
        :param overwrite: Overwrite files that already exist in the dataset.
        :param sources: Paths within a remote git repository to add.
        :param destination: Target directory inside the dataset's data dir.
        :param ref: Git reference used for remote repositories.
        :param external: Create external (symlinked) files.
        :param extract: Extract downloaded archives.
        :param all_at_once: Download all *urls* as one batch (dataset import).
        :param destination_names: Target names used with ``all_at_once``.
        :param progress: Progress-reporting class passed to downloads.
        :return: Tuple of (warning_messages, messages) lists.
        :raises errors.ParameterError: If *destination* exists and is a file.
        """
        messages = []
        warning_messages = []
        dataset_datadir = self.path / dataset.data_dir

        # Resolve the destination relative to the dataset's data directory.
        destination = destination or Path('.')
        destination = self._resolve_path(dataset_datadir, destination)
        destination = self.path / dataset_datadir / destination

        if destination.exists() and not destination.is_dir():
            raise errors.ParameterError(
                f'Destination is not a directory: "{destination}"')

        self.check_external_storage()

        files = []
        if all_at_once:  # Importing a dataset
            files = self._add_from_urls(dataset=dataset,
                                        urls=urls,
                                        destination_names=destination_names,
                                        destination=destination,
                                        extract=extract,
                                        progress=progress)
        else:
            for url in urls:
                is_remote, is_git = _check_url(url)

                if is_git and is_remote:  # Remote git repo
                    sources = sources or ()
                    new_files = self._add_from_git(dataset=dataset,
                                                   url=url,
                                                   sources=sources,
                                                   destination=destination,
                                                   ref=ref)
                else:
                    # "--source" only makes sense for git repositories.
                    if sources:
                        raise errors.UsageError(
                            'Cannot use "--source" with URLs or local files.')

                    if not is_remote:  # Local path, might be git
                        if is_git:
                            warning_messages.append(
                                'Adding data from local Git repository: ' +
                                'Use remote\'s Git URL instead to enable ' +
                                'lineage information and updates.')
                        u = parse.urlparse(url)
                        new_files = self._add_from_local(
                            dataset=dataset,
                            path=u.path,
                            external=external,
                            destination=destination)
                    else:  # Remote URL
                        new_files = self._add_from_url(dataset=dataset,
                                                       url=url,
                                                       destination=destination,
                                                       extract=extract,
                                                       progress=progress)

                files.extend(new_files)

        # Remove all files that are under a .git directory
        paths_to_avoid = [
            f['path'] for f in files
            if '.git' in str(f['path']).split(os.path.sep)
        ]
        if paths_to_avoid:
            files = [f for f in files if f['path'] not in paths_to_avoid]
            warning_messages.append(
                'Ignored adding paths under a .git directory:\n  ' +
                '\n  '.join(str(p) for p in paths_to_avoid))

        files_to_commit = {str(self.path / f['path']) for f in files}

        # Without --force, silently drop git-ignored files and warn about them.
        if not force:
            ignored_files = self.find_ignored_paths(*files_to_commit)
            if ignored_files:
                ignored_files = set(ignored_files)
                files_to_commit = files_to_commit.difference(ignored_files)
                ignored_sources = []
                for file_ in files:
                    if str(self.path / file_['path']) in ignored_files:
                        operation = file_.get('operation')
                        if operation:
                            # Report the original source, not the staging path.
                            src, _, _ = operation
                            ignored_sources.append(src)
                        else:
                            ignored_sources.append(file_['path'])

                files = [
                    f for f in files
                    if str(self.path / f['path']) in files_to_commit
                ]
                # BUG FIX: message typo 'Theses' -> 'These'.
                warning_messages.append(
                    'These paths are ignored by one of your .gitignore ' +
                    'files (use "--force" flag if you really want to add ' +
                    'them):\n  ' +
                    '\n  '.join([str(p) for p in ignored_sources]))

        # all files at this point can be force-added

        # Without --overwrite, keep existing dataset files and warn.
        if not overwrite:
            existing_files = dataset.find_files(files_to_commit)
            if existing_files:
                files_to_commit = files_to_commit.difference(existing_files)
                files = [
                    f for f in files
                    if str(self.path / f['path']) in files_to_commit
                ]
                warning_messages.append(
                    'These existing files were not overwritten ' +
                    '(use "--overwrite" flag to overwrite them):\n  ' +
                    '\n  '.join([str(p) for p in existing_files]))

        # Apply the deferred (src, dst, action) operations the helpers queued.
        for data in files:
            operation = data.pop('operation', None)
            if not operation:
                continue

            src, dst, action = operation

            # Remove existing file if any
            self.remove_file(dst)
            dst.parent.mkdir(parents=True, exist_ok=True)

            if action == 'copy':
                shutil.copy(src, dst)
            elif action == 'move':
                shutil.move(src, dst, copy_function=shutil.copy)
            elif action == 'symlink':
                self._create_external_file(src, dst)
                data['external'] = True
            else:
                raise errors.OperationError(f'Invalid action {action}')

        # Track non-symlinks in LFS
        if self.check_external_storage():
            lfs_paths = self.track_paths_in_storage(*files_to_commit)
            show_message = self.get_value('renku', 'show_lfs_message')
            if (lfs_paths
                    and (show_message is None or show_message == 'True')):
                messages.append(
                    ('Adding these files to Git LFS:\n' +
                     '\t{}'.format('\n\t'.join(lfs_paths)) +
                     '\nTo disable this message in the future, run:' +
                     '\n\trenku config show_lfs_message False'))

        # Force-add to include possible ignored files
        self.repo.git.add(*files_to_commit, force=True)
        self.repo.git.add(self.renku_pointers_path, force=True)

        # Only commit when something was actually staged.
        staged_files = self.repo.index.diff('HEAD')
        if staged_files:
            msg = 'renku dataset: committing {} newly added files'.format(
                len(files_to_commit))
            skip_hooks = not self.external_storage_requested
            self.repo.index.commit(msg, skip_hooks=skip_hooks)
        else:
            warning_messages.append('No file was added to project')

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            dataset_file = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to root for submodules.
            if dataset_file.client != self:
                dataset_file.path = str(data['path'])
            dataset_files.append(dataset_file)

        dataset.update_files(dataset_files)
        return warning_messages, messages
Example #8
0
    def _add_from_local(self, dataset, path, link, destination):
        """Add a file or directory from local filesystem.

        Directories are walked recursively; files already inside the
        project are returned as-is, others are copied (or hard-linked
        when *link* is set) into *destination*.
        """
        src = Path(path).resolve()

        if not src.exists():
            raise errors.ParameterError(
                'Cannot find file/directory: {}'.format(path))

        if destination.exists() and destination.is_dir():
            destination = destination / src.name

        if src.is_dir():
            if destination.exists() and not destination.is_dir():
                raise errors.ParameterError('Cannot copy directory to a file')

            # Cannot have a '.git' directory inside a Git repo
            if src.name == '.git':
                return []

            destination.mkdir(parents=True, exist_ok=True)
            collected = []
            for child in src.iterdir():
                collected.extend(
                    self._add_from_local(dataset,
                                         child.absolute().as_posix(),
                                         link=link,
                                         destination=destination))
            return collected

        # A file that already lives inside the project is referenced in place.
        try:
            path_in_repo = src.relative_to(self.path)
        except ValueError:
            pass
        else:
            return [{
                'path': path_in_repo,
                'url': path_in_repo,
                'creator': dataset.creator,
                'parent': self
            }]

        # Make sure the parent directory exists.
        destination.parent.mkdir(parents=True, exist_ok=True)

        if link:
            try:
                os.link(str(src), str(destination))
            except Exception as e:
                # Hard links fail across filesystems/devices.
                raise errors.OperationError('Could not create hard link '
                                            '- retry without --link.') from e
        else:
            shutil.copy(str(src), str(destination))

        return [{
            'path': destination.relative_to(self.path),
            'url': 'file://' + os.path.relpath(str(src), str(self.path)),
            'creator': dataset.creator,
            'parent': self
        }]