Exemple #1
0
def _construct_creators(creators, ignore_email=False):
    """Convert creator specifications into ``Person`` instances.

    Accepts an iterable of strings ('Name <email> [affiliation]') or dicts.
    Returns ``(people, no_email_warnings)``; the second element lists entries
    that lacked an email, populated only when ``ignore_email`` is set.
    """
    from collections.abc import Iterable

    creators = creators or ()

    # A bare string is iterable but is not a valid collection of creators.
    if isinstance(creators, str) or not isinstance(creators, Iterable):
        raise errors.ParameterError("Invalid type")

    hint = 'A valid format is "Name <email> [affiliation]"'
    people = []
    missing_email = []

    for entry in creators:
        if isinstance(entry, str):
            person = Person.from_string(entry)
        elif isinstance(entry, dict):
            person = Person.from_dict(entry)
        else:
            raise errors.ParameterError("Invalid type")

        if not person.name:  # pragma: no cover
            raise errors.ParameterError(
                f'Name is invalid: "{entry}".\n{hint}')

        if not person.email:
            if ignore_email:
                missing_email.append(entry)
            else:  # pragma: no cover
                raise errors.ParameterError(
                    f'Email is invalid: "{entry}".\n{hint}')

        people.append(person)

    return people, missing_email
Exemple #2
0
    def find_record(self, uri, client=None):
        """Retrieves a dataset from Renku.

        Resolves the dataset's project via the knowledge graph, clones the
        project, migrates it, and returns the matching dataset.

        :param uri: URL of the dataset
        :param client: client used to clone candidate project repositories
        :raises: ``LookupError``
        :raises: ``errors.ParameterError`` if no project can be cloned or the
            dataset is missing or ambiguous in the cloned project
        :return: ``_RenkuRecordSerializer`` wrapping the found dataset
        """
        from renku.core.management import LocalClient

        same_as, kg_urls = self._get_dataset_info(uri)
        project_url = None
        failed_urls = []

        for kg_url in kg_urls:
            kg_datasets_url, ssh_url, https_url = self._get_project_urls(kg_url)

            # Check if the project contains the dataset
            if same_as is None:  # Dataset is in the project
                dataset_id = self._extract_dataset_id(uri)
            else:  # Dataset is sameAs one of the datasets in the project
                datasets = self._query_knowledge_graph(kg_datasets_url)

                ids = [ds["identifier"] for ds in datasets if ds["sameAs"] == same_as]
                if not ids:
                    continue
                dataset_id = ids[0]

            # Check if we can clone the project; SSH is tried before HTTPS.
            for url in (ssh_url, https_url):
                try:
                    repo, repo_path = client.prepare_git_repo(url)
                except errors.GitError:
                    failed_urls.append(url)
                else:
                    project_url = url
                    break
            if project_url is not None:
                break

        if project_url is None:
            if failed_urls:
                message = "Cannot clone remote projects:\n\t" + "\n\t".join(failed_urls)
            else:
                message = "Cannot find any project for the dataset."

            raise errors.ParameterError(message, param_hint=uri)

        # Bring the cloned project's metadata up to date before reading it.
        remote_client = LocalClient(repo_path)
        self._migrate_project(remote_client)

        # Compare URL-quoted uids against the id obtained from the KG.
        datasets = [d for d in remote_client.datasets.values() if urllib.parse.quote(d.uid, safe="") == dataset_id]

        if len(datasets) == 0:
            raise errors.ParameterError(
                'Cannot find dataset with id "{}" in project "{}"'.format(dataset_id, project_url)
            )
        if len(datasets) > 1:
            raise errors.ParameterError('Found multiple datasets with id "{}"'.format(dataset_id))

        return _RenkuRecordSerializer(datasets[0], project_url, remote_client)
Exemple #3
0
    def create_dataset(
        self,
        short_name=None,
        title=None,
        description=None,
        creators=None,
        keywords=None,
    ):
        """Create a dataset.

        :param short_name: machine-friendly dataset name (required)
        :param title: human-readable title; defaults to ``short_name``
        :param description: optional free-text description
        :param creators: list of creators; defaults to the current git user
        :param keywords: optional iterable of keywords
        :raises: ``errors.ParameterError`` if short_name is missing or invalid
        :raises: ``errors.DatasetExistsError`` if the dataset already exists
        :return: tuple ``(dataset, path, dataset_ref)``
        """
        if not short_name:
            raise errors.ParameterError('Dataset short_name must be provided.')

        if not is_dataset_short_name_valid(short_name):
            raise errors.ParameterError(
                'Dataset short_name "{}" is not valid.'.format(short_name))

        if self.load_dataset(short_name=short_name):
            raise errors.DatasetExistsError(
                'Dataset exists: "{}".'.format(short_name))

        if not title:
            title = short_name

        # A fresh UUID names the dataset's metadata directory.
        identifier = str(uuid.uuid4())

        path = self.renku_datasets_path / identifier / self.METADATA

        if path.exists():
            raise errors.DatasetExistsError(
                'Dataset with reference {} exists'.format(path))

        path.parent.mkdir(parents=True, exist_ok=True)

        if creators is None:
            creators = [Person.from_git(self.repo)]

        keywords = keywords or ()

        # Build the dataset while recording the metadata file as its reference.
        with with_reference(path):
            dataset = Dataset(
                client=self,
                identifier=identifier,
                short_name=short_name,
                name=title,
                description=description,
                creator=creators,
                keywords=keywords,
            )

        # Named link so the dataset can be addressed by its short_name.
        dataset_ref = LinkReference.create(client=self,
                                           name='datasets/' + short_name)

        dataset_ref.set_reference(path)
        dataset.path = Path(dataset.path).relative_to(self.path)
        dataset.to_yaml()

        return dataset, path, dataset_ref
Exemple #4
0
    def _add_from_local(self, dataset, path, external, destination):
        """Add a file or directory from a local filesystem.

        :param dataset: dataset the files are added to
        :param path: local source path (file or directory)
        :param external: when truthy, files are symlinked instead of copied
        :param destination: target directory inside the dataset
        :raises: ``errors.ParameterError`` for missing or invalid paths
        :raises: ``errors.ProtectedFiles`` when a protected path is touched
        :return: list of dicts describing the files to be added
        """
        src = Path(os.path.abspath(path))

        if not src.exists():
            raise errors.ParameterError(f"Cannot find file/directory: {path}")

        dst = destination / src.name

        # if we have a directory, recurse
        if src.is_dir():
            if dst.exists() and not dst.is_dir():
                raise errors.ParameterError(f'Cannot copy directory to a file: "{dst}"')
            if src == (self.path / dataset.data_dir).resolve():
                raise errors.ParameterError(f"Cannot add dataset's data directory recursively: {path}")

            if self._check_protected_path(src):
                raise errors.ProtectedFiles([src])

            files = []
            for f in src.iterdir():
                files.extend(
                    self._add_from_local(dataset=dataset, path=os.path.abspath(f), external=external, destination=dst)
                )
            return files
        else:
            # Check if file is in the project and return it
            path_in_repo = None
            if self._is_external_file(src):
                path_in_repo = path
            else:
                try:
                    path_in_repo = src.relative_to(self.path)
                except ValueError:
                    # File lives outside the project; fall through to the
                    # copy/symlink branch below.
                    pass
                else:
                    if self._check_protected_path(src):
                        raise errors.ProtectedFiles([src])

            if path_in_repo:
                # Already part of the project: reference it in place.
                return [{"path": path_in_repo, "source": path_in_repo, "parent": self}]

        # Reached only for files outside the project: schedule a copy/symlink.
        action = "symlink" if external else "copy"
        return [
            {
                "path": dst.relative_to(self.path),
                "source": os.path.relpath(str(src), str(self.path)),
                "parent": self,
                "operation": (src, dst, action),
            }
        ]
Exemple #5
0
    def create_dataset(self,
                       name,
                       short_name=None,
                       description='',
                       creators=None):
        """Create a dataset.

        :param name: human-readable dataset name (required)
        :param short_name: machine-friendly name; derived from ``name`` if
            not given
        :param description: optional free-text description
        :param creators: list of creators; defaults to the current git user
        :raises: ``errors.ParameterError`` if the name is missing or invalid
        :raises: ``errors.DatasetExistsError`` if the dataset already exists
        :return: tuple ``(dataset, path, dataset_ref)``
        """
        if not name:
            raise errors.ParameterError('Dataset name must be provided.')

        if not short_name:
            short_name = generate_default_short_name(name, None)

        if not is_dataset_name_valid(short_name):
            raise errors.ParameterError(
                'Dataset name "{}" is not valid.'.format(short_name))

        if self.load_dataset(name=short_name):
            raise errors.DatasetExistsError(
                'Dataset exists: "{}".'.format(short_name))

        # A fresh UUID names the dataset's metadata directory.
        identifier = str(uuid.uuid4())

        path = self.renku_datasets_path / identifier / self.METADATA

        if path.exists():
            raise errors.DatasetExistsError(
                'Dataset with reference {} exists'.format(path))

        path.parent.mkdir(parents=True, exist_ok=True)

        if creators is None:
            creators = [Person.from_git(self.repo)]

        # Build the dataset while recording the metadata file as its reference.
        with with_reference(path):
            dataset = Dataset(client=self,
                              identifier=identifier,
                              name=name,
                              short_name=short_name,
                              description=description,
                              creator=creators)

        # Named link so the dataset can be addressed by its short_name.
        dataset_ref = LinkReference.create(client=self,
                                           name='datasets/' + short_name)

        dataset_ref.set_reference(path)
        dataset.to_yaml()

        return dataset, path, dataset_ref
Exemple #6
0
    def list_unpushed_lfs_paths(self, client=None):
        """List paths tracked in lfs for a client.

        :param client: client whose repository is inspected; defaults to self
        :raises: ``errors.ConfigurationError`` if no remote/tracking branch
            is configured
        :raises: ``errors.ParameterError`` if ``git lfs`` cannot be run
        :return: list of paths of lfs objects not yet pushed to a remote
        """
        client = client or self

        if (len(client.repo.remotes) < 1
                or not client.repo.active_branch.tracking_branch()):
            # Fix: the original concatenation produced "...branch <name>.Cleaning"
            # with no separator between the two sentences.
            raise errors.ConfigurationError(
                'No git remote is configured for {} branch {}. '.format(
                    client.path, client.repo.active_branch.name) +
                'Cleaning the storage cache would lead to a loss of data as ' +
                'it is not on a server. Please see ' +
                'https://www.atlassian.com/git/tutorials/syncing for ' +
                'information on how to sync with a remote.')
        try:
            status = check_output(self._CMD_STORAGE_STATUS,
                                  cwd=client.path,
                                  encoding='UTF-8')
        except (KeyboardInterrupt, OSError) as e:
            raise errors.ParameterError(
                'Couldn\'t run \'git lfs\':\n{0}'.format(e))

        # Everything before "Objects to be committed:" lists unpushed objects;
        # the first two lines are a header. Each entry ends with "(size)".
        files = status.split('Objects to be committed:')[0].splitlines()[2:]
        files = [
            client.path / f.rsplit('(', 1)[0].strip() for f in files
            if f.strip()
        ]
        return files
Exemple #7
0
    def from_string(cls, string):
        """Create an instance from a 'Name <email>' string."""
        # Name is everything before an optional '<'; email must look like
        # local@domain.tld and may not contain angle brackets.
        pattern = r'([^<]*)<{0,1}([^@<>]+@[^@<>]+\.[^@<>]+)*>{0,1}'
        name, email = re.search(pattern, string).groups()
        name = name.rstrip()

        hint = 'A valid format is "Name <email>"'

        # Check the git configuration.
        if not name:  # pragma: no cover
            raise errors.ParameterError('Name is invalid: ' + hint)

        if not email:  # pragma: no cover
            raise errors.ParameterError('Email is invalid: ' + hint)

        return cls(name=name, email=email)
 def short_name_validator(self, attribute, value):
     """Validate short_name."""
     # The short_name may have been escaped and can contain '%'.
     if not value:
         return
     if not is_dataset_short_name_valid(value):
         raise errors.ParameterError(
             'Invalid "short_name": {}'.format(value)
         )
Exemple #9
0
    def track_paths_in_storage(self, *paths):
        """Track paths in the external storage."""
        attrs = self.find_attr(*paths)
        track_paths = []

        for path in paths:
            # Skip files already tracked via filter=lfs in .gitattributes.
            if attrs.get(path, {}).get('filter') == 'lfs':
                continue

            candidate = Path(path)
            if candidate.is_dir():
                # Track the whole directory tree.
                track_paths.append(str(candidate / '**'))
            elif candidate.suffix != '.ipynb':
                # TODO create configurable filter and follow .gitattributes
                track_paths.append(str(candidate))

        if not track_paths:
            return

        try:
            call(
                self._CMD_STORAGE_TRACK + track_paths,
                stdout=PIPE,
                stderr=STDOUT,
                cwd=str(self.path),
            )
        except (KeyboardInterrupt, OSError) as e:
            raise errors.ParameterError(
                'Couldn\'t run \'git lfs\':\n{0}'.format(e)
            )
 def checkout(repo, ref):
     """Check out ``ref`` in the given Git repository.

     :param repo: GitPython ``Repo`` object
     :param ref: branch, tag, or commit reference to check out
     :raises: ``errors.ParameterError`` if the reference cannot be checked out
     """
     try:
         repo.git.checkout(ref)
     except GitCommandError:
         # Fix: the original message referenced an undefined name `url`,
         # which raised NameError instead of the intended ParameterError;
         # report the repository's location instead.
         raise errors.ParameterError(
             'Cannot find reference "{}" in Git repository: {}'.format(
                 ref, repo.working_dir))
Exemple #11
0
 def init_external_storage(self, force=False):
     """Initialize the external storage for data."""
     command = list(self._CMD_STORAGE_INSTALL)
     if force:
         command.append("--force")
     try:
         call(command, stdout=PIPE, stderr=STDOUT, cwd=self.path)
     except (KeyboardInterrupt, OSError) as e:
         raise errors.ParameterError("Couldn't run 'git lfs':\n{0}".format(e))
Exemple #12
0
 def untrack_paths_from_storage(self, *paths):
     """Untrack paths from the external storage."""
     command = self._CMD_STORAGE_UNTRACK + list(paths)
     try:
         call(command, stdout=PIPE, stderr=STDOUT, cwd=self.path)
     except (KeyboardInterrupt, OSError) as e:
         raise errors.ParameterError("Couldn't run 'git lfs':\n{0}".format(e))
Exemple #13
0
 def _resolve_path(self, root_path, path):
     """Check if a path is within a root path and resolve it."""
     try:
         root_path = Path(root_path).resolve()
         return (root_path / path).resolve().relative_to(root_path)
     except ValueError:
         raise errors.ParameterError('File {} is not within path {}'.format(
             path, root_path))
Exemple #14
0
def raise_template_error(value):
    """Raise template error with short explanation."""
    example = ('Example: --template-variables '
               '\'{ "variable_1": "string", "variable_2": 2 }\'')
    lines = ['{0}'.format(value), 'Tip: a dictionary is expected', example]
    raise errors.ParameterError('\n'.join(lines), '"--template-variables"')
Exemple #15
0
 def list_tracked_paths(self, client=None):
     """List paths tracked in lfs for a client."""
     client = client or self
     try:
         output = check_output(self._CMD_STORAGE_LIST, cwd=client.path, encoding="UTF-8")
     except (KeyboardInterrupt, OSError) as e:
         raise errors.ParameterError("Couldn't run 'git lfs':\n{0}".format(e))
     return [client.path / line for line in output.splitlines()]
 def fmt_path(path):
     """Format path as relative to the client path."""
     # NOTE(review): relies on a `client` captured from the enclosing scope.
     absolute = os.path.abspath(client.path / path)
     try:
         return str(Path(absolute).relative_to(client.path))
     except ValueError:
         raise errors.ParameterError(
             f'File {absolute} is not within the project.'
         )
Exemple #17
0
def resolve_data_directory(data_dir, path):
    """Check data directory is within the project path."""
    if not data_dir:
        return

    resolved = (Path(path) / data_dir).resolve()

    try:
        relative = resolved.relative_to(path)
    except ValueError:
        raise errors.ParameterError(
            f"Data directory {data_dir} is not within project {path}")

    # Reject reserved locations such as the project's own metadata dirs.
    if str(relative).rstrip(os.path.sep) in INVALID_DATA_DIRS:
        raise errors.ParameterError(
            f"Cannot use {relative} as data directory.")

    return relative
Exemple #18
0
    def remove_dataset_tags(self, dataset, tags):
        """Removes tags from a dataset."""
        existing = {tag.name for tag in dataset.tags}
        not_found = set(tags) - existing

        if not_found:
            raise errors.ParameterError("Tags {} not found".format(", ".join(not_found)))

        # Keep only the tags that were not requested for removal.
        dataset.tags = [tag for tag in dataset.tags if tag.name not in tags]
        return dataset
Exemple #19
0
def update_config(client, key, *, value=None, remove=False, global_only=False, commit_message=None):
    """Add, update, or remove configuration values.

    :param client: client providing ``set_value``/``remove_value``
    :param key: configuration key, optionally "section.key"
    :param value: value to set (ignored when ``remove`` is set)
    :param remove: remove the key instead of setting it
    :param global_only: operate on the global configuration only
    :param commit_message: unused here; kept for interface compatibility
    :raises: ``errors.ParameterError`` if a key to remove does not exist
    :return: the affected value (set value, or the removed value)
    """
    section, section_key = _split_section_and_key(key)
    if remove:
        value = client.remove_value(section, section_key, global_only=global_only)
        if value is None:
            raise errors.ParameterError('Key "{}" not found.'.format(key))
    else:
        client.set_value(section, section_key, value, global_only=global_only)
    # Fix: the remove branch previously discarded the removed value and
    # returned None implicitly; report the affected value on both paths.
    return value
Exemple #20
0
def read_config(client, key, local_only, global_only):
    """Read configuration."""
    if not key:
        # No key: return the whole configuration.
        return client.get_config(local_only=local_only, global_only=global_only)

    section, section_key = _split_section_and_key(key)
    value = client.get_value(section, section_key, local_only=local_only, global_only=global_only)
    if value is None:
        raise errors.ParameterError('Key "{}" not found.'.format(key))
    return value
Exemple #21
0
def _make_headers(columns, columns_mapping):
    headers = OrderedDict()
    for column in columns:
        if column not in columns_mapping:
            raise errors.ParameterError(
                'Invalid column name: "{}".\nPossible values: {}'.format(
                    column, ', '.join(columns_mapping)))
        name, display_name = columns_mapping.get(column)
        headers[name] = display_name

    return headers
Exemple #22
0
def parse_parameters(ctx, param, value):
    """Parse parameters to dictionary."""
    parameters = {}
    for raw in value:
        key, sep, val = raw.partition("=")
        # Require a '=' separator and a non-empty key.
        if not sep or not key:
            raise errors.ParameterError(
                'Parameter format must be --parameter "param1"="value". ',
                f'--parameter "{raw}"')
        parameters[key] = val
    return parameters
Exemple #23
0
    def _check_config_is_not_readonly(self, section, key):
        """Raise if ``(section, key)`` is a read-only configuration entry."""
        from renku.core import errors

        readonly_configs = {'renku': [self.DATA_DIR_CONFIG_KEY]}

        # Only values that are actually set locally are protected.
        if not self.get_value(section, key, local_only=True):
            return

        if key in readonly_configs.get(section, []):
            raise errors.ParameterError(
                f'Configuration {key} cannot be modified.')
Exemple #24
0
 def init_external_storage(self, force=False):
     """Initialize the external storage for data."""
     command = list(self._CMD_STORAGE_INSTALL)
     if force:
         command.append('--force')
     try:
         call(
             command,
             stdout=PIPE,
             stderr=STDOUT,
             cwd=str(self.path.absolute()),
         )
     except (KeyboardInterrupt, OSError) as e:
         raise errors.ParameterError(
             'Couldn\'t run \'git lfs\':\n{0}'.format(e)
         )
    def set_parameters(self, client, *, dataverse_server_url, dataverse_name,
                       **kwargs):
        """Set and validate required parameters for a provider."""
        CONFIG_BASE_URL = 'server_url'

        if dataverse_server_url:
            # Persist the URL so future invocations can omit it.
            client.set_value('dataverse',
                             CONFIG_BASE_URL,
                             dataverse_server_url,
                             global_only=True)
        else:
            # Fall back to a previously stored URL.
            dataverse_server_url = client.get_value('dataverse',
                                                    CONFIG_BASE_URL)

        if not dataverse_server_url:
            raise errors.ParameterError('Dataverse server URL is required.')

        if not dataverse_name:
            raise errors.ParameterError('Dataverse name is required.')

        self._server_url = dataverse_server_url
        self._dataverse_name = dataverse_name
Exemple #26
0
    def _download(self,
                  url,
                  filename,
                  extract,
                  progress_class=None,
                  chunk_size=16384):
        """Download ``url`` into a cache directory and optionally extract it.

        :param url: URL to download
        :param filename: target file name; derived from the URL path if empty
        :param extract: when truthy, try to unpack the downloaded archive
        :param progress_class: progress callback class; defaults to
            ``DownloadProgressCallback``
        :param chunk_size: streaming chunk size in bytes
        :raises: ``errors.ParameterError`` if no file name can be derived
        :return: tuple ``(directory, files)`` with the resulting path(s)
        """
        def extract_dataset(filepath):
            """Extract downloaded file."""
            try:
                tmp = tempfile.mkdtemp()
                patoolib.extract_archive(str(filepath),
                                         outdir=tmp,
                                         verbosity=-1)
            except patoolib.util.PatoolError:
                # Not a recognized archive: keep the downloaded file as-is.
                return filepath.parent, [filepath]
            else:
                # Extracted successfully; drop the archive itself.
                filepath.unlink()
                return Path(tmp), [p for p in Path(tmp).rglob('*')]

        # Downloads land in a temp dir under the client's cache directory.
        tmp_root = self.renku_path / self.CACHE
        tmp_root.mkdir(parents=True, exist_ok=True)
        tmp = tempfile.mkdtemp(dir=tmp_root)

        with requests.get(url, stream=True) as request:
            request.raise_for_status()

            if not filename:
                # Derive the file name from the last URL path segment.
                u = parse.urlparse(url)
                filename = Path(u.path).name
                if not filename:
                    raise errors.ParameterError(
                        f'URL Cannot find a file to download from {url}')

            download_to = Path(tmp) / filename
            with open(str(download_to), 'wb') as file_:
                # content-length may be absent; 0 disables a meaningful total.
                total_size = int(request.headers.get('content-length', 0))
                progress_class = progress_class or DownloadProgressCallback
                progress = progress_class(description=filename,
                                          total_size=total_size)

                try:
                    for chunk in request.iter_content(chunk_size=chunk_size):
                        if chunk:  # ignore keep-alive chunks
                            file_.write(chunk)
                            progress.update(size=len(chunk))
                finally:
                    progress.finalize()
        if extract:
            return extract_dataset(download_to)

        return download_to.parent, [download_to]
Exemple #27
0
    def _get_src_and_dst(self, path, repo_path, sources, dst_root):
        if not sources:
            source = Path('.')
        else:
            source = None
            for s in sources:
                try:
                    Path(path).relative_to(s)
                except ValueError:
                    pass
                else:
                    source = s
                    break

            if not source:
                return

        src = repo_path / path
        source_name = Path(source).name
        relative_path = Path(path).relative_to(source)

        if not dst_root.exists():
            if len(sources) == 1:
                dst = dst_root / relative_path
            else:  # Treat destination as a directory
                dst = dst_root / source_name / relative_path
        elif dst_root.is_dir():
            dst = dst_root / source_name / relative_path
        else:  # Destination is an existing file
            if len(sources) == 1 and not src.is_dir():
                dst = dst_root
            elif not sources:
                raise errors.ParameterError('Cannot copy repo to file')
            else:
                raise errors.ParameterError(
                    'Cannot copy multiple files or directories to a file')

        return (path, src, dst, source)
Exemple #28
0
    def add_dataset_tag(self, dataset, tag, description='', force=False):
        """Adds a new tag to a dataset.

        Validates if the tag already exists and that the tag follows
        the same rules as docker tags.
        See https://docs.docker.com/engine/reference/commandline/tag/
        for a documentation of docker tag syntax.

        :raises: errors.ParameterError
        """
        if len(tag) > 128:
            raise errors.ParameterError(
                'Tags can be at most 128 characters long.')

        if not re.match('^(?![.-])[a-zA-Z0-9_.-]{1,128}$', tag):
            raise errors.ParameterError(
                ('Tag {} is invalid. \n'
                 'Only characters a-z, A-Z, 0-9, ., - and _ '
                 'are allowed. \nTag can\'t start with a . or -').format(tag))

        duplicate = any(t.name == tag for t in dataset.tags)
        if duplicate:
            if not force:
                raise errors.ParameterError(
                    'Tag {} already exists'.format(tag))
            # Drop the existing tag before re-adding it.
            dataset.tags = [t for t in dataset.tags if t.name != tag]

        # Pin the tag to the dataset's latest commit.
        latest_commit = list(self.dataset_commits(dataset, max_results=1))[0]

        dataset.tags.append(DatasetTag(name=tag,
                                       description=description,
                                       commit=latest_commit.hexsha,
                                       dataset=dataset.name))

        return dataset
Exemple #29
0
def tabulate(collection, columns, columns_mapping):
    """Format collection with a tabular output."""
    if not columns:
        raise errors.ParameterError('Columns cannot be empty.')

    requested = [name.strip().lower() for name in columns.split(',') if name]
    headers = _make_headers(requested, columns_mapping)

    # Sort rows by the first requested attribute.
    sort_attr = next(iter(headers))
    rows = sorted(collection, key=lambda item: getattr(item, sort_attr))

    return tabulate_(rows, headers=headers, disable_numparse=True)
    def _get_src_and_dst(self, path, repo_path, sources, dst_root,
                         used_sources):
        """Match ``path`` against ``sources`` and compute its destination.

        :param path: path (relative to ``repo_path``) being considered
        :param repo_path: root of the source repository
        :param sources: mapping of requested sources; wildcard entries that
            match a directory are rewritten in place (value set to ``None``)
        :param dst_root: destination root path
        :param used_sources: set collecting sources that matched (mutated)
        :raises: ``errors.ParameterError`` when copying a directory onto a file
        :return: ``(path, src, dst)`` or ``None`` if no source matches
        """
        is_wildcard = False

        if not sources:
            source = Path('.')
        else:
            source = None
            for s in sources.keys():
                try:
                    Path(path).relative_to(s)
                except ValueError:
                    # Not a path prefix; try interpreting the source as a glob.
                    # NOTE(review): assumes `glob` is a wcmatch-style module
                    # with GLOBSTAR ('**') support — confirm against imports.
                    if glob.globmatch(path, str(s), flags=glob.GLOBSTAR):
                        is_wildcard = True
                        source = path
                        used_sources.add(s)
                        break
                else:
                    source = s
                    used_sources.add(source)
                    break

            if not source:
                return

        src = repo_path / path
        source_name = Path(source).name
        relative_path = Path(path).relative_to(source)

        if src.is_dir() and is_wildcard:
            # A wildcard matched a directory: record it so its contents are
            # treated as coming from that directory.
            sources[source] = None
            used_sources.add(source)

        if not dst_root.exists():  # Destination will be a file or directory
            if len(sources) == 1 and not is_wildcard:
                dst = dst_root / relative_path
            else:  # Treat destination as a directory
                dst = dst_root / source_name / relative_path
        elif dst_root.is_dir():
            dst = dst_root / source_name / relative_path
        else:  # Destination is an existing file
            if src.is_dir():
                raise errors.ParameterError(
                    'Cannot copy multiple files or directories to a file')
            # Later we need to check if we are copying multiple files
            dst = dst_root

        return (path, src, dst)