def test_creator_parse(creators, data_file):
    """Test that different options for specifying creators work."""
    f = DatasetFile(path='file', creator=creators)
    creator = Person(name='me', email='*****@*****.**')
    assert creator in f.creator

    # email check
    with pytest.raises(ValueError):
        Person(name='me', email='meexample.com')

    # creators must be a set or list of dicts or Person
    with pytest.raises(ValueError):
        f = DatasetFile(path='file', creator=['name'])
def as_dataset(self, client): """Deserialize `ZenodoRecordSerializer` to `Dataset`.""" files = self.get_files() metadata = self.get_jsonld() dataset = Dataset.from_jsonld(metadata, client=client) serialized_files = [] for file_ in files: remote_ = file_.remote_url dataset_file = DatasetFile( url=remote_.geturl(), id=file_.id, checksum=file_.checksum, filename=file_.filename, filesize=file_.filesize, filetype=file_.type, path='', ) serialized_files.append(dataset_file) dataset.files = serialized_files if isinstance(dataset.url, dict) and '_id' in dataset.url: dataset.url = urllib.parse.urlparse(dataset.url.pop('_id')) dataset.url = dataset.url.geturl() return dataset
def as_dataset(self, client): """Deserialize `DataverseRecordSerializer` to `Dataset`.""" files = self.get_files() dataset = Dataset.from_jsonld(self._json, client=client, schema_class=_DataverseDatasetSchema) if dataset.description and not dataset.description.strip(): dataset.description = None for creator in dataset.creator: if creator.affiliation == '': creator.affiliation = None serialized_files = [] for file_ in files: remote_ = file_.remote_url dataset_file = DatasetFile( url=remote_.geturl(), id=file_._id if file_._id else file_.name, filename=file_.name, filesize=file_.content_size, filetype=file_.file_format, path='', ) serialized_files.append(dataset_file) dataset.files = serialized_files return dataset
def as_dataset(self, client): """Deserialize `DataverseRecordSerializer` to `Dataset`.""" files = self.get_files() dataset = Dataset.from_jsonld(self._json, client=client) serialized_files = [] for file_ in files: remote_ = file_.remote_url dataset_file = DatasetFile( url=remote_.geturl(), id=file_._id if file_._id else file_.name, filename=file_.name, filesize=file_.content_size, filetype=file_.file_format, path='', ) serialized_files.append(dataset_file) dataset.files = serialized_files return dataset
def update_dataset_files(self, files, ref, delete=False):
    """Update files and dataset metadata according to their remotes.

    :param files: List of files to be updated
    :param delete: Indicates whether to delete files or not

    :return: List of files that should be deleted
    """
    from renku import LocalClient

    visited_repos = {}
    updated_files = []
    deleted_files = []

    for file_ in files:
        if not file_.based_on:
            continue

        file_.based_on = DatasetFile.from_jsonld(file_.based_on)
        based_on = file_.based_on
        url = based_on.url
        if url in visited_repos:
            repo, repo_path, remote_client = visited_repos[url]
        else:
            repo, repo_path = self.prepare_git_repo(url, ref)
            remote_client = LocalClient(repo_path)
            visited_repos[url] = repo, repo_path, remote_client

        remote_file = self._fetch_file_metadata(remote_client, based_on.path)

        if not remote_file:
            try:
                remote_file = DatasetFile.from_revision(
                    remote_client,
                    path=based_on.path,
                    url=url,
                    added=based_on.added
                )
            except KeyError:
                raise errors.ParameterError(
                    'Cannot find file {} in the repo {}'.format(
                        based_on.url, url))

        commit_sha = self._get_commit_sha_from_label(based_on)
        remote_commit_sha = self._get_commit_sha_from_label(remote_file)
        if commit_sha != remote_commit_sha:
            src = Path(repo.working_dir) / based_on.path
            dst = self.renku_path.parent / file_.path

            if src.exists():
                # Fetch file if it is tracked by Git LFS
                self._fetch_lfs_files(repo_path, {based_on.path})
                if remote_client._is_external_file(src):
                    self.remove_file(dst)
                    self._create_external_file(src.resolve(), dst)
                else:
                    shutil.copy(src, dst)
                file_.based_on.commit = remote_file.commit
                file_.based_on._label = remote_file._label
                updated_files.append(file_)
            else:
                # File was removed or renamed
                if delete:
                    self.remove_file(dst)
                    deleted_files.append(file_)

    if not updated_files and (not delete or not deleted_files):
        # Nothing to commit or update
        return deleted_files

    # Commit changes in files
    file_paths = {str(f.path) for f in updated_files + deleted_files}
    # Force-add to include possible ignored files that are in datasets
    self.repo.git.add(*(file_paths), force=True)
    self.repo.index.commit(
        'renku dataset: updated {} files and deleted {} files'.format(
            len(updated_files), len(deleted_files)))

    # Update datasets' metadata
    modified_datasets = {}

    for file_ in updated_files:
        # Re-create list of creators
        creators = []
        # grab all the creators from the commit history
        for commit in repo.iter_commits(paths=file_.path):
            creator = Person.from_commit(commit)
            if creator not in creators:
                creators.append(creator)

        new_file = DatasetFile.from_revision(
            self, path=file_.path, based_on=file_.based_on, creator=creators)
        file_.dataset.update_files([new_file])
        modified_datasets[file_.dataset.name] = file_.dataset

    if delete:
        for file_ in deleted_files:
            file_.dataset.unlink_file(file_.path)
            modified_datasets[file_.dataset.name] = file_.dataset

    for dataset in modified_datasets.values():
        dataset.to_yaml()

    return deleted_files
def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)

    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self.prepare_git_repo(url, ref)
    files = set()
    used_sources = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(path, repo_path, sources, destination,
                                       used_sources)

        if result:
            files.add(result)

    unused_sources = set(sources.keys()) - used_sources
    if unused_sources:
        unused_sources = {str(s) for s in unused_sources}
        raise errors.ParameterError('No such file or directory',
                                    param_hint=unused_sources)

    if destination.exists() and not destination.is_dir():
        if len(files) > 1:
            raise errors.ParameterError(
                'Cannot copy multiple files or directories to a file')

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            try:
                path = str(src.resolve().relative_to(repo_path))
            except ValueError:  # External file
                pass
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    for path, src, dst in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                creators = based_on.creator
            else:
                creators = []
                # grab all the creators from the commit history
                for commit in repo.iter_commits(paths=path):
                    creator = Person.from_commit(commit)
                    if creator not in creators:
                        creators.append(creator)

                based_on = DatasetFile.from_revision(remote_client,
                                                     path=path,
                                                     url=url,
                                                     creator=creators)

            path_in_dst_repo = dst.relative_to(self.path)

            if remote_client._is_external_file(src):
                operation = (src.resolve(), dst, 'symlink')
            else:
                operation = (src, dst, 'copy')

            results.append({
                'path': path_in_dst_repo,
                'url': remove_credentials(url),
                'creator': creators,
                'parent': self,
                'based_on': based_on,
                'operation': operation
            })

    return results
def add_data_to_dataset(self,
                        dataset,
                        urls,
                        force=False,
                        sources=(),
                        destination='',
                        ref=None,
                        link=False,
                        external=False,
                        extract=False,
                        all_at_once=False,
                        destination_names=None,
                        progress=None):
    """Import the data into the data directory."""
    warning_message = ''
    dataset_path = self.path / self.datadir / dataset.short_name

    destination = destination or Path('.')
    destination = self._resolve_path(dataset_path, destination)
    destination = self.path / dataset_path / destination

    files = []

    if all_at_once:  # only for URLs
        files = self._add_from_urls(dataset=dataset,
                                    urls=urls,
                                    destination_names=destination_names,
                                    destination=destination,
                                    extract=extract,
                                    progress=progress)
    else:
        for url in urls:
            is_remote, is_git = _check_url(url)

            if is_git and is_remote:  # Remote git repo
                sources = sources or ()
                new_files = self._add_from_git(dataset, url, sources,
                                               destination, ref)
            else:
                if sources:
                    raise errors.UsageError(
                        'Cannot use "--source" with URLs or local files.')

                if not is_remote:  # Local path, might be git
                    if is_git:
                        warning_message = 'Adding data from local Git ' \
                            'repository. Use remote\'s Git URL instead ' \
                            'to enable lineage information and updates.'
                    u = parse.urlparse(url)
                    new_files = self._add_from_local(
                        dataset, u.path, link, external, destination)
                else:  # Remote URL
                    new_files = self._add_from_url(dataset,
                                                   url,
                                                   destination,
                                                   extract,
                                                   progress=progress)

            files.extend(new_files)

    files_to_commit = {f['path'] for f in files if f['path']}
    ignored = self.find_ignored_paths(*files_to_commit)

    if not force:
        if ignored:
            raise errors.IgnoredFiles(ignored)
        if dataset.contains_any(files):
            raise errors.DatasetFileExists()

    # all files at this point can be force-added and overwritten

    for data in files:
        operation = data.pop('operation', None)
        if not operation:
            continue

        src, dst, action = operation

        # Remove existing file if any
        self.remove_file(dst)
        dst.parent.mkdir(parents=True, exist_ok=True)

        if action == 'copy':
            shutil.copy(src, dst)
        elif action == 'link':
            try:
                os.link(src, dst)
            except Exception as e:
                raise errors.OperationError(
                    'Could not create hard link. Retry without "--link."'
                ) from e
        elif action == 'symlink':
            self._create_external_file(src, dst)
            data['external'] = True

    # Track non-symlinks in LFS
    self.track_paths_in_storage(*files_to_commit)

    # Force-add to include possible ignored files
    self.repo.git.add(*files_to_commit, force=True)
    self.repo.git.add(self.renku_pointers_path, force=True)

    staged_files = self.repo.index.diff('HEAD')
    if staged_files:
        msg = 'renku dataset: committing {} newly added files'.format(
            len(files_to_commit))
        self.repo.index.commit(msg)

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        if os.path.basename(str(data['path'])) == '.git':
            continue

        dataset_file = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to root for submodules.
        if dataset_file.client != self:
            dataset_file.path = str(data['path'])
        dataset_files.append(dataset_file)

    dataset.update_files(dataset_files)
    return warning_message
def _migrate_submodule_based_datasets(client):
    from renku.core.management import LocalClient
    from renku.core.management.migrate import is_project_unsupported

    submodules = client.repo.submodules
    if not submodules:
        return

    for s in submodules:
        try:
            s.update()
        except GitError:
            pass

    submodules_urls = {s.path: s.url for s in submodules}

    repo_paths = []
    symlinks = []

    for dataset in client.datasets.values():
        for file_ in dataset.files:
            path = client.path / file_.path
            if not path.is_symlink():
                continue

            target = path.resolve()

            if '/.renku/vendors/' not in str(target):
                continue

            repo = Repo(target.parent, search_parent_directories=True)
            repo_path = repo.working_dir
            if repo_path not in repo_paths:
                repo_paths.append(repo_path)

            symlinks.append((file_.path, target, repo_path))

    if not symlinks:
        return

    remote_clients = {p: LocalClient(p) for p in repo_paths}

    for remote_client in remote_clients.values():
        if not is_project_unsupported(remote_client):
            migrate(remote_client)

    metadata = {}

    for path, target, repo_path in symlinks:
        remote_client = remote_clients[repo_path]
        path_within_repo = target.relative_to(repo_path)

        repo_is_remote = '.renku/vendors/local' not in repo_path
        based_on = None
        submodule_path = Path(repo_path).relative_to(client.path)

        url = submodules_urls.get(str(submodule_path), '')

        if repo_is_remote:
            based_on = _fetch_file_metadata(remote_client, path_within_repo)
            if based_on:
                based_on.url = url
                based_on.based_on = None
            else:
                based_on = DatasetFile.from_revision(remote_client,
                                                     path=path_within_repo,
                                                     url=url)
        else:
            if url:
                full_path = Path(url) / path_within_repo
                rel_path = os.path.relpath(full_path, client.path)
                url = f'file://{rel_path}'

        metadata[path] = (based_on, url)

        path = client.path / path
        path.unlink()

        try:
            shutil.move(target, path)
        except FileNotFoundError:
            raise errors.InvalidFileOperation(f'File was not found: {target}')

    for s in submodules:
        if s.path.startswith('.renku/vendors/'):
            try:
                s.remove(force=True)
            except ValueError:
                pass

    for dataset in client.datasets.values():
        for file_ in dataset.files:
            if file_.path in metadata:
                based_on, url = metadata[file_.path]
                file_.based_on = based_on
                file_.url = remove_credentials(url)

        dataset.to_yaml()
def add_data_to_dataset(self,
                        dataset,
                        urls,
                        force=False,
                        overwrite=False,
                        sources=(),
                        destination='',
                        ref=None,
                        external=False,
                        extract=False,
                        all_at_once=False,
                        destination_names=None,
                        progress=None):
    """Import the data into the data directory."""
    messages = []
    warning_messages = []
    dataset_datadir = self.path / dataset.data_dir

    destination = destination or Path('.')
    destination = self._resolve_path(dataset_datadir, destination)
    destination = self.path / dataset_datadir / destination

    if destination.exists() and not destination.is_dir():
        raise errors.ParameterError(
            f'Destination is not a directory: "{destination}"')

    self.check_external_storage()

    files = []
    if all_at_once:  # Importing a dataset
        files = self._add_from_urls(dataset=dataset,
                                    urls=urls,
                                    destination_names=destination_names,
                                    destination=destination,
                                    extract=extract,
                                    progress=progress)
    else:
        for url in urls:
            is_remote, is_git = _check_url(url)
            if is_git and is_remote:  # Remote git repo
                sources = sources or ()
                new_files = self._add_from_git(dataset=dataset,
                                               url=url,
                                               sources=sources,
                                               destination=destination,
                                               ref=ref)
            else:
                if sources:
                    raise errors.UsageError(
                        'Cannot use "--source" with URLs or local files.')

                if not is_remote:  # Local path, might be git
                    if is_git:
                        warning_messages.append(
                            'Adding data from local Git repository: ' +
                            'Use remote\'s Git URL instead to enable ' +
                            'lineage information and updates.')
                    u = parse.urlparse(url)
                    new_files = self._add_from_local(
                        dataset=dataset,
                        path=u.path,
                        external=external,
                        destination=destination)
                else:  # Remote URL
                    new_files = self._add_from_url(dataset=dataset,
                                                   url=url,
                                                   destination=destination,
                                                   extract=extract,
                                                   progress=progress)

            files.extend(new_files)

    # Remove all files that are under a .git directory
    paths_to_avoid = [
        f['path'] for f in files
        if '.git' in str(f['path']).split(os.path.sep)
    ]
    if paths_to_avoid:
        files = [f for f in files if f['path'] not in paths_to_avoid]
        warning_messages.append(
            'Ignored adding paths under a .git directory:\n ' +
            '\n '.join(str(p) for p in paths_to_avoid))

    files_to_commit = {str(self.path / f['path']) for f in files}

    if not force:
        ignored_files = self.find_ignored_paths(*files_to_commit)
        if ignored_files:
            ignored_files = set(ignored_files)
            files_to_commit = files_to_commit.difference(ignored_files)
            ignored_sources = []
            for file_ in files:
                if str(self.path / file_['path']) in ignored_files:
                    operation = file_.get('operation')
                    if operation:
                        src, _, _ = operation
                        ignored_sources.append(src)
                    else:
                        ignored_sources.append(file_['path'])

            files = [
                f for f in files
                if str(self.path / f['path']) in files_to_commit
            ]
            warning_messages.append(
                'These paths are ignored by one of your .gitignore ' +
                'files (use "--force" flag if you really want to add ' +
                'them):\n ' +
                '\n '.join([str(p) for p in ignored_sources]))

    # all files at this point can be force-added

    if not overwrite:
        existing_files = dataset.find_files(files_to_commit)
        if existing_files:
            files_to_commit = files_to_commit.difference(existing_files)
            files = [
                f for f in files
                if str(self.path / f['path']) in files_to_commit
            ]
            warning_messages.append(
                'These existing files were not overwritten ' +
                '(use "--overwrite" flag to overwrite them):\n ' +
                '\n '.join([str(p) for p in existing_files]))

    for data in files:
        operation = data.pop('operation', None)
        if not operation:
            continue

        src, dst, action = operation

        # Remove existing file if any
        self.remove_file(dst)
        dst.parent.mkdir(parents=True, exist_ok=True)

        if action == 'copy':
            shutil.copy(src, dst)
        elif action == 'move':
            shutil.move(src, dst, copy_function=shutil.copy)
        elif action == 'symlink':
            self._create_external_file(src, dst)
            data['external'] = True
        else:
            raise errors.OperationError(f'Invalid action {action}')

    # Track non-symlinks in LFS
    if self.check_external_storage():
        lfs_paths = self.track_paths_in_storage(*files_to_commit)
        show_message = self.get_value('renku', 'show_lfs_message')
        if (lfs_paths and
                (show_message is None or show_message == 'True')):
            messages.append(
                ('Adding these files to Git LFS:\n' +
                 '\t{}'.format('\n\t'.join(lfs_paths)) +
                 '\nTo disable this message in the future, run:' +
                 '\n\trenku config show_lfs_message False'))

    # Force-add to include possible ignored files
    self.repo.git.add(*files_to_commit, force=True)
    self.repo.git.add(self.renku_pointers_path, force=True)

    staged_files = self.repo.index.diff('HEAD')
    if staged_files:
        msg = 'renku dataset: committing {} newly added files'.format(
            len(files_to_commit))
        skip_hooks = not self.external_storage_requested
        self.repo.index.commit(msg, skip_hooks=skip_hooks)
    else:
        warning_messages.append('No file was added to project')

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        dataset_file = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to root for submodules.
        if dataset_file.client != self:
            dataset_file.path = str(data['path'])
        dataset_files.append(dataset_file)

    dataset.update_files(dataset_files)

    return warning_messages, messages
def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)

    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self._prepare_git_repo(url, ref)
    copied_sources = set()
    files = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(path, repo_path, sources, destination)

        if result:
            files.add(result)
            source = result[3]
            copied_sources.add(source)

    uncopied_sources = sources - copied_sources
    if uncopied_sources:
        uncopied_sources = {str(s) for s in uncopied_sources}
        raise errors.ParameterError('No such file or directory',
                                    param_hint=uncopied_sources)

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _, __ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            path = str(src.resolve().relative_to(repo_path))
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    for path, src, dst, _ in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                creators = based_on.creator
            else:
                creators = []
                # grab all the creators from the commit history
                for commit in repo.iter_commits(paths=path):
                    creator = Person.from_commit(commit)
                    if creator not in creators:
                        creators.append(creator)

                based_on = DatasetFile.from_revision(remote_client,
                                                     path=path,
                                                     url=url)

            path_in_dst_repo = dst.relative_to(self.path)

            results.append({
                'path': path_in_dst_repo,
                'url': remove_credentials(url),
                'creator': creators,
                'parent': self,
                'based_on': based_on
            })

            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(str(src), str(dst))

    return results
def add_data_to_dataset(self,
                        dataset,
                        urls,
                        force=False,
                        sources=(),
                        destination='',
                        ref=None,
                        link=False,
                        extract=False,
                        all_at_once=False,
                        progress=None):
    """Import the data into the data directory."""
    warning_message = ''
    dataset_path = self.path / self.datadir / dataset.short_name

    destination = destination or Path('.')
    destination = self._resolve_path(dataset_path, destination)
    destination = self.path / dataset_path / destination

    files = []

    if all_at_once:  # only for URLs
        files = self._add_from_urls(dataset=dataset,
                                    urls=urls,
                                    destination=destination,
                                    extract=extract,
                                    progress=progress)
    else:
        for url in urls:
            is_remote, is_git = _check_url(url)

            if is_git and is_remote:  # Remote git repo
                sources = sources or ()
                new_files = self._add_from_git(dataset, url, sources,
                                               destination, ref)
            else:
                if sources:
                    raise errors.UsageError(
                        'Cannot use "--source" with URLs or local files.')

                if not is_remote:  # Local path, might be git
                    if is_git:
                        warning_message = 'Adding data from local Git ' \
                            'repository. Use remote\'s Git URL instead ' \
                            'to enable lineage information and updates.'
                    u = parse.urlparse(url)
                    new_files = self._add_from_local(
                        dataset, u.path, link, destination)
                else:  # Remote URL
                    new_files = self._add_from_url(dataset, url, destination,
                                                   extract)

            files.extend(new_files)

    self.track_paths_in_storage(*(f['path'] for f in files))

    ignored = self.find_ignored_paths(*(data['path']
                                        for data in files)) or []

    if ignored:
        if force:
            self.repo.git.add(*ignored, force=True)
        else:
            raise errors.IgnoredFiles(ignored)

    if dataset.contains_any(files) and force is False:
        raise errors.DatasetFileExists()

    # commit all new data
    file_paths = {str(data['path']) for data in files if str(data['path'])}
    files_to_add = (file_paths - set(ignored))

    self.repo.git.add(*files_to_add)

    if self.repo.is_dirty():
        commit_msg = ('renku dataset: '
                      'committing {} newly added files'
                      ).format(len(file_paths) + len(ignored))

        self.repo.index.commit(commit_msg)

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        if os.path.basename(str(data['path'])) == '.git':
            continue

        dataset_file = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to root for submodules.
        if dataset_file.client != self:
            dataset_file.path = str(data['path'])
        dataset_files.append(dataset_file)

    dataset.update_files(dataset_files)
    return warning_message
def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)

    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self.prepare_git_repo(url, ref)
    files = set()
    used_sources = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(path, repo_path, sources, destination, used_sources)

        if result:
            files.add(result)

    unused_sources = set(sources.keys()) - used_sources
    if unused_sources:
        unused_sources = {str(s) for s in unused_sources}
        raise errors.ParameterError("No such file or directory", param_hint=unused_sources)

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            try:
                path = str(src.resolve().relative_to(repo_path))
            except ValueError:  # External file
                pass
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    new_files = []
    for path, src, dst in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                based_on.source = url
            else:
                based_on = DatasetFile.from_revision(remote_client, path=src, url=url, source=url)

            path_in_dst_repo = dst.relative_to(self.path)

            if path_in_dst_repo in new_files:
                # A path with the same destination is already copied
                continue

            new_files.append(path_in_dst_repo)

            if remote_client._is_external_file(src):
                operation = (src.resolve(), dst, "symlink")
            else:
                operation = (src, dst, "copy")

            results.append(
                {
                    "path": path_in_dst_repo,
                    "source": remove_credentials(url),
                    "parent": self,
                    "based_on": based_on,
                    "operation": operation,
                }
            )

    return results