Beispiel #1
0
def test_creator_parse(creators, data_file):
    """Test that different options for specifying creators work."""
    f = DatasetFile(path='file', creator=creators)
    creator = Creator(name='me', email='*****@*****.**')
    assert creator in f.creator

    # email check
    with pytest.raises(ValueError):
        Creator(name='me', email='meexample.com')

    # creators must be a set or list of dicts or Creator
    with pytest.raises(ValueError):
        f = DatasetFile(path='file', creator=['name'])
Beispiel #2
0
    def as_dataset(self, client):
        """Deserialize `ZenodoRecordSerializer` to `Dataset`."""
        files = self.get_files()
        metadata = self.get_jsonld()
        dataset = Dataset.from_jsonld(metadata, client=client)

        serialized_files = []
        for file_ in files:
            remote_ = file_.remote_url
            dataset_file = DatasetFile(
                url=remote_,
                id=file_.id,
                checksum=file_.checksum,
                filename=file_.filename,
                filesize=file_.filesize,
                filetype=file_.type,
                dataset=dataset.name,
                path='',
            )
            serialized_files.append(dataset_file)

        dataset.files = serialized_files

        if isinstance(dataset.url, dict) and '_id' in dataset.url:
            dataset.url = urllib.parse.urlparse(dataset.url.pop('_id'))
            dataset.url = dataset.url.geturl()

        return dataset
Beispiel #3
0
    def add_data_to_dataset(
        self, dataset, urls, git=False, force=False, **kwargs
    ):
        """Import the data into the data directory."""
        dataset_path = self.path / self.datadir / dataset.name

        files = []

        for url in urls:
            git = git or check_for_git_repo(url)

            target = kwargs.pop('target', None)

            if git:
                if isinstance(target, (str, NoneType)):
                    files.extend(
                        self._add_from_git(
                            dataset, dataset_path, url, target, **kwargs
                        )
                    )
                else:
                    for t in target:
                        files.extend(
                            self._add_from_git(
                                dataset, dataset_path, url, t, **kwargs
                            )
                        )
            else:
                files.extend(
                    self._add_from_url(dataset, dataset_path, url, **kwargs)
                )

        ignored = self.find_ignored_paths(*(data['path']
                                            for data in files)) or []

        if ignored:
            if force:
                self.repo.git.add(*ignored, force=True)
            else:
                raise errors.IgnoredFiles(ignored)

        # commit all new data
        file_paths = {str(data['path']) for data in files if str(data['path'])}
        self.repo.git.add(*(file_paths - set(ignored)))
        self.repo.index.commit(
            'renku dataset: commiting {} newly added files'.
            format(len(file_paths) + len(ignored))
        )

        # Generate the DatasetFiles
        dataset_files = []
        for data in files:
            datasetfile = DatasetFile.from_revision(self, **data)

            # Set dataset file path relative to projects root for submodules
            if datasetfile.client != self:
                datasetfile.path = str(data['path'])
            dataset_files.append(datasetfile)
        dataset.update_files(dataset_files)
Beispiel #4
0
    def _add_from_url(self, dataset, path, url, nocopy=False, **kwargs):
        """Process an add from url and return the location on disk."""
        u = parse.urlparse(url)

        if u.scheme not in Dataset.SUPPORTED_SCHEMES:
            raise NotImplementedError('{} URLs are not supported'.format(
                u.scheme))

        dst = path.joinpath(os.path.basename(url)).absolute()

        if u.scheme in ('', 'file'):
            src = Path(u.path).absolute()

            # if we have a directory, recurse
            if src.is_dir():
                files = {}
                os.mkdir(dst)
                for f in src.iterdir():
                    files.update(
                        self._add_from_url(dataset,
                                           dst,
                                           f.absolute().as_posix(),
                                           nocopy=nocopy))
                return files
            if nocopy:
                try:
                    os.link(src, dst)
                except Exception as e:
                    raise Exception('Could not create hard link '
                                    '- retry without nocopy.') from e
            else:
                shutil.copy(src, dst)

            # Do not expose local paths.
            src = None
        else:
            try:
                response = requests.get(url)
                dst.write_bytes(response.content)
            except error.HTTPError as e:  # pragma nocover
                raise e

        # make the added file read-only
        mode = dst.stat().st_mode & 0o777
        dst.chmod(mode & ~(stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))

        self.track_paths_in_storage(dst.relative_to(self.path))
        dataset_path = self.path / self.datadir / dataset.name
        result = dst.relative_to(dataset_path).as_posix()
        return {
            result:
            DatasetFile(
                path=result,
                url=url,
                authors=dataset.authors,
                dataset=dataset.name,
            )
        }
Beispiel #5
0
    def as_dataset(self, client):
        """Deserialize `DataverseRecordSerializer` to `Dataset`."""
        files = self.get_files()
        dataset = Dataset.from_jsonld(self._json, client=client)

        serialized_files = []
        for file_ in files:
            remote_ = file_.remote_url
            dataset_file = DatasetFile(
                url=remote_,
                id=file_._id if file_._id else file_.name,
                filename=file_.name,
                filesize=file_.content_size,
                filetype=file_.file_format,
                dataset=dataset.name,
                path='',
            )
            serialized_files.append(dataset_file)

        dataset.files = serialized_files

        return dataset
Beispiel #6
0
    def _add_from_git(self, dataset, path, url, target, **kwargs):
        """Process adding resources from another git repository.

        The submodules are placed in ``.renku/vendors`` and linked
        to the *path* specified by the user.
        """
        # create the submodule
        u = parse.urlparse(url)
        submodule_path = self.renku_path / 'vendors' / (u.netloc or 'local')

        # Respect the directory struture inside the source path.
        relative_to = kwargs.get('relative_to', None)

        if u.scheme in ('', 'file'):
            warnings.warn('Importing local git repository, use HTTPS')
            # determine where is the base repo path
            r = git.Repo(url, search_parent_directories=True)
            src_repo_path = Path(r.git_dir).parent
            submodule_name = os.path.basename(src_repo_path)
            submodule_path = submodule_path / str(src_repo_path).lstrip('/')

            # if repo path is a parent, rebase the paths and update url
            if src_repo_path != Path(u.path):
                top_target = Path(
                    u.path
                ).resolve().absolute().relative_to(src_repo_path)
                if target:
                    target = top_target / target
                else:
                    target = top_target
                url = src_repo_path.as_posix()
        elif u.scheme in ('http', 'https'):
            submodule_name = os.path.splitext(os.path.basename(u.path))[0]
            submodule_path = submodule_path.joinpath(
                os.path.dirname(u.path).lstrip('/'), submodule_name
            )
        else:
            raise NotImplementedError(
                'Scheme {} not supported'.format(u.scheme)
            )

        # FIXME: do a proper check that the repos are not the same
        if submodule_name not in (s.name for s in self.git.submodules):
            # new submodule to add
            self.git.create_submodule(
                name=submodule_name, path=submodule_path.as_posix(), url=url
            )

        src = submodule_path / (target or '')

        if target and relative_to:
            relative_to = Path(relative_to)
            if relative_to.is_absolute():
                assert u.scheme in {
                    '', 'file'
                }, ('Only relative paths can be used with URLs.')
                target = (Path(url).resolve().absolute() / target).relative_to(
                    relative_to.resolve()
                )
            else:
                # src already includes target so we do not have to append it
                target = src.relative_to(submodule_path / relative_to)

        # link the target into the data directory
        dst = self.path / path / submodule_name / (target or '')

        # if we have a directory, recurse
        if src.is_dir():
            files = {}
            dst.mkdir(parents=True, exist_ok=True)
            # FIXME get all files from submodule index
            for f in src.iterdir():
                try:
                    files.update(
                        self._add_from_git(
                            dataset,
                            path,
                            url,
                            target=f.relative_to(submodule_path),
                            **kwargs
                        )
                    )
                except ValueError:
                    pass  # skip files outside the relative path
            return files

        if not dst.parent.exists():
            dst.parent.mkdir(parents=True)

        os.symlink(os.path.relpath(src, dst.parent), dst)

        # grab all the authors from the commit history
        git_repo = git.Repo(submodule_path.absolute().as_posix())
        authors = []
        for commit in git_repo.iter_commits(paths=target):
            author = Author.from_commit(commit)
            if author not in authors:
                authors.append(author)

        dataset_path = self.path / self.datadir / dataset.name
        result = dst.relative_to(dataset_path).as_posix()

        if u.scheme in ('', 'file'):
            url = None
        else:
            url = '{}/{}'.format(url, target)

        return {
            result:
                DatasetFile(
                    path=result,
                    url=url,
                    authors=authors,
                    dataset=dataset.name,  # TODO detect original dataset
                )
        }