def test_creator_parse(creators, data_file):
    """Test that different options for specifying creators work."""
    dataset_file = DatasetFile(path='file', creator=creators)
    expected = Creator(name='me', email='*****@*****.**')
    assert expected in dataset_file.creator

    # A string without an ``@`` is not a valid email address.
    with pytest.raises(ValueError):
        Creator(name='me', email='meexample.com')

    # Creators must be given as a set or list of dicts or Creator
    # instances; a bare string list is rejected.
    with pytest.raises(ValueError):
        dataset_file = DatasetFile(path='file', creator=['name'])
def as_dataset(self, client):
    """Deserialize `ZenodoRecordSerializer` to `Dataset`."""
    record_files = self.get_files()
    metadata = self.get_jsonld()
    dataset = Dataset.from_jsonld(metadata, client=client)

    # Wrap every remote record file in a DatasetFile; ``path`` stays
    # empty because nothing has been downloaded yet.
    dataset.files = [
        DatasetFile(
            url=record_file.remote_url,
            id=record_file.id,
            checksum=record_file.checksum,
            filename=record_file.filename,
            filesize=record_file.filesize,
            filetype=record_file.type,
            dataset=dataset.name,
            path='',
        )
        for record_file in record_files
    ]

    # JSON-LD may deliver the url as ``{'_id': ...}``; normalize it to
    # a plain string.
    if isinstance(dataset.url, dict) and '_id' in dataset.url:
        parsed_url = urllib.parse.urlparse(dataset.url.pop('_id'))
        dataset.url = parsed_url.geturl()

    return dataset
def add_data_to_dataset(
    self, dataset, urls, git=False, force=False, **kwargs
):
    """Import the data into the data directory.

    Resolves each URL either as a git repository (via
    ``_add_from_git``) or a plain file/URL (via ``_add_from_url``),
    stages the resulting files, commits them, and records them on
    *dataset* as ``DatasetFile`` entries.

    :param dataset: dataset receiving the files; ``dataset.name``
        selects the sub-directory under the data directory.
    :param urls: iterable of URLs or local paths to import.
    :param git: treat every URL as a git repository; when ``False``
        each URL is probed with ``check_for_git_repo``.
    :param force: stage files even if they match an ignore pattern;
        otherwise :class:`errors.IgnoredFiles` is raised.
    :param kwargs: forwarded to the ``_add_from_*`` helpers;
        ``target`` (str, None, or iterable of targets) is consumed
        here.
    :raises errors.IgnoredFiles: when ignored files are added without
        ``force``.
    """
    dataset_path = self.path / self.datadir / dataset.name

    files = []

    for url in urls:
        # NOTE(review): once any URL looks like a git repo, ``git``
        # stays True for all remaining URLs — confirm this stickiness
        # is intended.
        git = git or check_for_git_repo(url)

        # NOTE(review): ``pop`` removes ``target`` from kwargs, so
        # only the first URL sees it; later iterations get None.
        target = kwargs.pop('target', None)

        if git:
            # A single (or absent) target is handled directly; an
            # iterable of targets triggers one git-add per target.
            if isinstance(target, (str, NoneType)):
                files.extend(
                    self._add_from_git(
                        dataset, dataset_path, url, target, **kwargs
                    )
                )
            else:
                for t in target:
                    files.extend(
                        self._add_from_git(
                            dataset, dataset_path, url, t, **kwargs
                        )
                    )
        else:
            files.extend(
                self._add_from_url(dataset, dataset_path, url, **kwargs)
            )

    # Files matching ignore patterns are only staged when forced.
    ignored = self.find_ignored_paths(*(data['path'] for data in files)) or []

    if ignored:
        if force:
            self.repo.git.add(*ignored, force=True)
        else:
            raise errors.IgnoredFiles(ignored)

    # commit all new data
    file_paths = {str(data['path']) for data in files if str(data['path'])}
    self.repo.git.add(*(file_paths - set(ignored)))
    self.repo.index.commit(
        'renku dataset: commiting {} newly added files'.
        format(len(file_paths) + len(ignored))
    )

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        datasetfile = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to projects root for submodules
        if datasetfile.client != self:
            datasetfile.path = str(data['path'])
        dataset_files.append(datasetfile)
    dataset.update_files(dataset_files)
def _add_from_url(self, dataset, path, url, nocopy=False, **kwargs):
    """Process an add from url and return the location on disk.

    Local (``file://`` or scheme-less) sources are copied — or
    hard-linked when *nocopy* is set — into *path*, recursing into
    directories. Remote ``http(s)`` sources are downloaded.

    :param dataset: dataset receiving the file (supplies name/authors).
    :param path: destination directory (a ``Path``).
    :param url: source URL or local path.
    :param nocopy: hard-link local files instead of copying them.
    :raises NotImplementedError: for unsupported URL schemes.
    :raises requests.HTTPError: when a remote download returns an
        HTTP error status.
    :return: mapping of relative destination path to its
        ``DatasetFile`` record.
    """
    u = parse.urlparse(url)

    if u.scheme not in Dataset.SUPPORTED_SCHEMES:
        raise NotImplementedError('{} URLs are not supported'.format(
            u.scheme))

    dst = path.joinpath(os.path.basename(url)).absolute()

    if u.scheme in ('', 'file'):
        src = Path(u.path).absolute()

        # if we have a directory, recurse
        if src.is_dir():
            files = {}
            os.mkdir(dst)
            for f in src.iterdir():
                files.update(
                    self._add_from_url(dataset, dst,
                                       f.absolute().as_posix(),
                                       nocopy=nocopy))
            return files

        if nocopy:
            try:
                os.link(src, dst)
            except OSError as e:
                # Hard links fail e.g. across filesystems; give the
                # user an actionable hint instead of a raw errno.
                raise Exception('Could not create hard link '
                                '- retry without nocopy.') from e
        else:
            shutil.copy(src, dst)

        # Do not expose local paths.
        src = None
    else:
        response = requests.get(url)
        # BUGFIX: previously the body was written unconditionally, so
        # 4xx/5xx error pages were silently saved as data. (The old
        # ``except error.HTTPError`` handler was dead code — requests
        # never raises urllib's HTTPError.)
        response.raise_for_status()
        dst.write_bytes(response.content)

    # Strip the execute bits so imported data is never runnable
    # (write permission is left untouched).
    mode = dst.stat().st_mode & 0o777
    dst.chmod(mode & ~(stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))

    self.track_paths_in_storage(dst.relative_to(self.path))
    dataset_path = self.path / self.datadir / dataset.name
    result = dst.relative_to(dataset_path).as_posix()
    return {
        result: DatasetFile(
            path=result,
            url=url,
            authors=dataset.authors,
            dataset=dataset.name,
        )
    }
def as_dataset(self, client):
    """Deserialize `DataverseRecordSerializer` to `Dataset`."""
    record_files = self.get_files()
    dataset = Dataset.from_jsonld(self._json, client=client)

    # Describe every remote record file as a DatasetFile; ``path`` is
    # left empty since no file has been fetched yet. Files without an
    # ``_id`` fall back to their name as identifier.
    dataset.files = [
        DatasetFile(
            url=record_file.remote_url,
            id=record_file._id if record_file._id else record_file.name,
            filename=record_file.name,
            filesize=record_file.content_size,
            filetype=record_file.file_format,
            dataset=dataset.name,
            path='',
        )
        for record_file in record_files
    ]

    return dataset
def _add_from_git(self, dataset, path, url, target, **kwargs):
    """Process adding resources from another git repository.

    The submodules are placed in ``.renku/vendors`` and linked to the
    *path* specified by the user.

    :param dataset: dataset receiving the files.
    :param path: data directory the files are linked into.
    :param url: URL or local path of the source git repository.
    :param target: path inside the source repository to add; ``None``
        adds the whole repository.
    :param kwargs: ``relative_to`` rebases the layout of the imported
        files relative to that source sub-directory.
    :raises NotImplementedError: for schemes other than ``file``/
        ``http``/``https``/empty.
    :return: mapping of relative destination path to its
        ``DatasetFile`` record.
    """
    # create the submodule
    u = parse.urlparse(url)
    submodule_path = self.renku_path / 'vendors' / (u.netloc or 'local')

    # Respect the directory structure inside the source path.
    relative_to = kwargs.get('relative_to', None)

    if u.scheme in ('', 'file'):
        warnings.warn('Importing local git repository, use HTTPS')
        # determine where is the base repo path
        r = git.Repo(url, search_parent_directories=True)
        src_repo_path = Path(r.git_dir).parent
        submodule_name = os.path.basename(src_repo_path)
        submodule_path = submodule_path / str(src_repo_path).lstrip('/')

        # If the given path points inside the repository, rebase the
        # target onto the repository root and use the root as the url.
        if src_repo_path != Path(u.path):
            top_target = Path(
                u.path
            ).resolve().absolute().relative_to(src_repo_path)
            if target:
                target = top_target / target
            else:
                target = top_target
            url = src_repo_path.as_posix()
    elif u.scheme in ('http', 'https'):
        # e.g. https://host/org/repo.git -> name "repo" under
        # vendors/host/org/repo.
        submodule_name = os.path.splitext(os.path.basename(u.path))[0]
        submodule_path = submodule_path.joinpath(
            os.path.dirname(u.path).lstrip('/'), submodule_name
        )
    else:
        raise NotImplementedError(
            'Scheme {} not supported'.format(u.scheme)
        )

    # FIXME: do a proper check that the repos are not the same
    if submodule_name not in (s.name for s in self.git.submodules):
        # new submodule to add
        self.git.create_submodule(
            name=submodule_name, path=submodule_path.as_posix(), url=url
        )

    src = submodule_path / (target or '')

    if target and relative_to:
        relative_to = Path(relative_to)
        if relative_to.is_absolute():
            # An absolute relative_to only makes sense for local
            # sources where the absolute path can be resolved.
            assert u.scheme in {
                '', 'file'
            }, ('Only relative paths can be used with URLs.')
            target = (Path(url).resolve().absolute() / target).relative_to(
                relative_to.resolve()
            )
        else:
            # src already includes target so we do not have to append it
            target = src.relative_to(submodule_path / relative_to)

    # link the target into the data directory
    dst = self.path / path / submodule_name / (target or '')

    # if we have a directory, recurse
    if src.is_dir():
        files = {}
        dst.mkdir(parents=True, exist_ok=True)
        # FIXME get all files from submodule index
        for f in src.iterdir():
            try:
                files.update(
                    self._add_from_git(
                        dataset,
                        path,
                        url,
                        target=f.relative_to(submodule_path),
                        **kwargs
                    )
                )
            except ValueError:
                pass  # skip files outside the relative path
        return files

    if not dst.parent.exists():
        dst.parent.mkdir(parents=True)

    # Symlink from the data directory into the vendored submodule;
    # a relative link keeps the repository relocatable.
    os.symlink(os.path.relpath(src, dst.parent), dst)

    # grab all the authors from the commit history
    git_repo = git.Repo(submodule_path.absolute().as_posix())
    authors = []
    for commit in git_repo.iter_commits(paths=target):
        author = Author.from_commit(commit)
        if author not in authors:
            authors.append(author)

    dataset_path = self.path / self.datadir / dataset.name
    result = dst.relative_to(dataset_path).as_posix()

    # Local sources keep no url (paths must not be exposed); remote
    # ones record the file's location inside the repository.
    if u.scheme in ('', 'file'):
        url = None
    else:
        url = '{}/{}'.format(url, target)

    return {
        result: DatasetFile(
            path=result,
            url=url,
            authors=authors,
            dataset=dataset.name,
            # TODO detect original dataset
        )
    }