def test_migrations_no_commit(isolated_runner, old_project):
    """Check --no-commit flag doesn't commit changes."""
    client = LocalClient(path=old_project['path'])
    sha_before = client.repo.head.object.hexsha

    result = isolated_runner.invoke(cli, ['migrate', '--no-commit'])
    assert 0 == result.exit_code
    assert 'OK' in result.output
    assert sha_before == client.repo.head.object.hexsha
def test_migrate_project(isolated_runner, old_project):
    """Test migrate on old repository."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code
    assert not old_project.is_dirty()

    client = LocalClient(path=old_project.working_dir)
    assert client.project
    assert client.project.name
def test_migrations_no_commit(isolated_runner, old_project):
    """Check --no-commit flag doesn't commit changes."""
    client = LocalClient(path=old_project.working_dir)
    sha_before = client.repo.head.object.hexsha

    result = isolated_runner.invoke(cli, ["migrate", "--no-commit"])
    assert 0 == result.exit_code
    assert "OK" in result.output
    assert sha_before == client.repo.head.object.hexsha
def test_migration_broken_urls(dataset_metadata):
    """Check that migration of broken dataset file URLs yields a string."""
    dataset = Dataset.from_jsonld(
        dataset_metadata,
        client=LocalClient('.'),
    )

    for file_ in dataset.files:
        assert isinstance(url_to_string(file_.url), str)
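# Illustrative sketch only, not renku's actual ``url_to_string``: the test
# above checks that migrated file URLs come back as plain strings.  Assuming
# (hypothetically) that broken URLs were stored as parsed
# ``urllib.parse.ParseResult`` objects, a string-coercing helper with the
# behaviour the test asserts could look like this:
from urllib.parse import ParseResult, urlunparse


def _url_to_string_sketch(url):
    """Return ``url`` as a plain string, unparsing ParseResult objects."""
    if isinstance(url, ParseResult):
        # Re-assemble the six URL components back into a single string.
        return urlunparse(url)
    return str(url)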
def test_author_to_creator_migration(isolated_runner, old_project):
    """Check renaming of author to creator migration."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code

    client = LocalClient(path=old_project.working_dir)
    for dataset in client.datasets.values():
        after_metadata = (Path(dataset.path) / client.METADATA).read_text()
        assert "creator:" in after_metadata
        assert "authors:" not in after_metadata
def test_correct_relative_path(isolated_runner, old_project):
    """Check if path on dataset has been correctly migrated."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code

    client = LocalClient(path=old_project.working_dir)
    assert client.datasets

    for ds in client.datasets.values():
        assert not Path(ds.path).is_absolute()
        assert ds.path.startswith(RENKU_HOME)
def test_latest_version(project):
    """Test returning the latest version of `SoftwareAgent`."""
    from renku import __version__

    create_dataset(
        "ds1",
        title="",
        description="",
        creators=[],
    )

    agent_version = LocalClient(project).latest_agent
    assert __version__ == agent_version
def test_author_to_creator_migration(isolated_runner, old_project):
    """Check renaming of author to creator migration."""
    client = LocalClient(path=old_project['path'])

    if client.datasets:
        dataset = client.datasets.popitem()[1]

        dataset_path_pre40 = Path(dataset.path.replace('-', ''))
        if dataset_path_pre40.exists():
            metadata = (dataset_path_pre40 / client.METADATA).read_text()
            assert 'authors:' in metadata

        result = isolated_runner.invoke(cli, ['migrate', 'datasets'])
        assert 0 == result.exit_code

        after_metadata = (Path(dataset.path) / client.METADATA).read_text()
        assert 'creator:' in after_metadata
        assert 'authors:' not in after_metadata
def test_correct_path_migrated(isolated_runner, old_project):
    """Check if path on dataset files has been correctly migrated."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code

    client = LocalClient(path=old_project.working_dir)
    assert client.datasets

    for ds in client.datasets.values():
        for file_ in ds.files:
            path_ = Path(file_.path)
            assert path_.exists()
            assert not path_.is_absolute()
            assert file_._label
            assert file_._id
            assert file_.path in file_._label
            assert file_.path in file_._id
def test_latest_version_user_commits(project):
    """Test retrieval of `SoftwareAgent` with latest non-renku command."""
    from git import Repo

    from renku import __version__

    create_dataset(
        "ds1",
        title="",
        description="",
        creators=[],
    )

    myfile = Path("myfile")
    myfile.write_text("123")

    repo = Repo(project)
    repo.index.add([str(myfile)])
    repo.index.commit("added myfile")

    agent_version = LocalClient(project).latest_agent
    assert __version__ == agent_version
def update_dataset_files(self, files, ref, delete=False):
    """Update files and dataset metadata according to their remotes.

    :param files: List of files to be updated
    :param delete: Indicates whether to delete files or not

    :return: List of files that should be deleted
    """
    from renku import LocalClient

    visited_repos = {}
    updated_files = []
    deleted_files = []

    for file_ in files:
        if not file_.based_on:
            continue

        file_.based_on = DatasetFile.from_jsonld(file_.based_on)
        based_on = file_.based_on
        url = based_on.url
        if url in visited_repos:
            repo, repo_path, remote_client = visited_repos[url]
        else:
            repo, repo_path = self.prepare_git_repo(url, ref)
            remote_client = LocalClient(repo_path)
            visited_repos[url] = repo, repo_path, remote_client

        remote_file = self._fetch_file_metadata(remote_client, based_on.path)

        if not remote_file:
            try:
                remote_file = DatasetFile.from_revision(
                    remote_client,
                    path=based_on.path,
                    url=url,
                    added=based_on.added
                )
            except KeyError:
                raise errors.ParameterError(
                    'Cannot find file {} in the repo {}'.format(
                        based_on.url, url
                    )
                )

        commit_sha = self._get_commit_sha_from_label(based_on)
        remote_commit_sha = self._get_commit_sha_from_label(remote_file)
        if commit_sha != remote_commit_sha:
            src = Path(repo.working_dir) / based_on.path
            dst = self.renku_path.parent / file_.path

            if src.exists():
                # Fetch file if it is tracked by Git LFS
                self._fetch_lfs_files(repo_path, {based_on.path})
                if remote_client._is_external_file(src):
                    self.remove_file(dst)
                    self._create_external_file(src.resolve(), dst)
                else:
                    shutil.copy(src, dst)
                file_.based_on.commit = remote_file.commit
                file_.based_on._label = remote_file._label
                updated_files.append(file_)
            else:
                # File was removed or renamed
                if delete:
                    self.remove_file(dst)
                    deleted_files.append(file_)

    if not updated_files and (not delete or not deleted_files):
        # Nothing to commit or update
        return deleted_files

    # Commit changes in files
    file_paths = {str(f.path) for f in updated_files + deleted_files}
    # Force-add to include possible ignored files that are in datasets
    self.repo.git.add(*(file_paths), force=True)
    self.repo.index.commit(
        'renku dataset: updated {} files and deleted {} files'.format(
            len(updated_files), len(deleted_files)
        )
    )

    # Update datasets' metadata
    modified_datasets = {}
    for file_ in updated_files:
        # Re-create list of creators
        creators = []
        # grab all the creators from the commit history
        for commit in repo.iter_commits(paths=file_.path):
            creator = Person.from_commit(commit)
            if creator not in creators:
                creators.append(creator)

        new_file = DatasetFile.from_revision(
            self, path=file_.path, based_on=file_.based_on, creator=creators
        )
        file_.dataset.update_files([new_file])
        modified_datasets[file_.dataset.name] = file_.dataset

    if delete:
        for file_ in deleted_files:
            file_.dataset.unlink_file(file_.path)
            modified_datasets[file_.dataset.name] = file_.dataset

    for dataset in modified_datasets.values():
        dataset.to_yaml()

    return deleted_files
def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)
    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self.prepare_git_repo(url, ref)
    files = set()
    used_sources = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(
            path, repo_path, sources, destination, used_sources
        )

        if result:
            files.add(result)

    unused_sources = set(sources.keys()) - used_sources
    if unused_sources:
        unused_sources = {str(s) for s in unused_sources}
        raise errors.ParameterError(
            'No such file or directory', param_hint=unused_sources
        )

    if destination.exists() and not destination.is_dir():
        if len(files) > 1:
            raise errors.ParameterError(
                'Cannot copy multiple files or directories to a file'
            )

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            try:
                path = str(src.resolve().relative_to(repo_path))
            except ValueError:  # External file
                pass
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    for path, src, dst in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                creators = based_on.creator
            else:
                creators = []
                # grab all the creators from the commit history
                for commit in repo.iter_commits(paths=path):
                    creator = Person.from_commit(commit)
                    if creator not in creators:
                        creators.append(creator)

                based_on = DatasetFile.from_revision(
                    remote_client, path=path, url=url, creator=creators
                )

            path_in_dst_repo = dst.relative_to(self.path)

            if remote_client._is_external_file(src):
                operation = (src.resolve(), dst, 'symlink')
            else:
                operation = (src, dst, 'copy')

            results.append({
                'path': path_in_dst_repo,
                'url': remove_credentials(url),
                'creator': creators,
                'parent': self,
                'based_on': based_on,
                'operation': operation
            })

    return results
def test_comprehensive_dataset_migration(isolated_runner, old_dataset_project):
    """Test migration of old project with all dataset variations."""
    result = isolated_runner.invoke(cli, ["migrate"])
    assert 0 == result.exit_code
    assert "OK" in result.output

    client = LocalClient(path=old_dataset_project.working_dir)

    dataset = client.load_dataset("dataverse")
    assert dataset._id.endswith(
        "/datasets/1d2ed1e4-3aeb-4f25-90b2-38084ee3d86c"
    )
    assert "1d2ed1e4-3aeb-4f25-90b2-38084ee3d86c" == dataset.identifier
    assert "1d2ed1e4-3aeb-4f25-90b2-38084ee3d86c" == dataset._label
    assert "Cornell University" == dataset.creators[0].affiliation
    assert "Rooth, Mats" == dataset.creators[0].name
    assert "Rooth, Mats" == dataset.creators[0].label
    assert dataset.date_published is None
    assert "2020-08-10T21:35:05.115412+00:00" == dataset.date_created.isoformat("T")
    assert "Replication material for a paper to be presented" in dataset.description
    assert "https://doi.org/10.7910/DVN/EV6KLF" == dataset.same_as.url
    assert "1" == dataset.tags[0].name
    assert "Tag 1 created by renku import" == dataset.tags[0].description
    assert isinstance(dataset.license, dict)
    assert "https://creativecommons.org/publicdomain/zero/1.0/" in str(dataset.license)

    file_ = dataset.find_file("data/dataverse/copy.sh")
    assert "https://dataverse.harvard.edu/api/access/datafile/3050656" == file_.source
    assert file_.url.endswith(
        "/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/data/dataverse/copy.sh"
    )
    assert "2020-08-10T21:35:10.877832+00:00" == file_.added.isoformat("T")
    assert file_.based_on is None
    assert not hasattr(file_, "creators")

    dataset = client.load_dataset("mixed")
    assert "v1" == dataset.tags[0].name

    file_ = dataset.find_file("data/mixed/Makefile")
    assert file_._id.endswith(
        "/blob/a5f6c3700616e005ac599d24feb7a770430bd93a/data/mixed/Makefile"
    )
    assert "https://github.com/SwissDataScienceCenter/renku-jupyter.git" == file_.source
    assert file_.source == file_.based_on.source
    assert file_.source == file_.based_on.url
    assert "Makefile@49f331d7388785208ccfb3cfb9156b226d9b59ea" == file_.based_on._label
    assert file_.based_on.based_on is None
    assert file_.url.endswith(
        "/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/data/mixed/Makefile"
    )

    file_ = dataset.find_file("data/mixed/data.txt")
    assert file_._id.endswith(
        "/blob/b32138c1bcb2b53da974bbeb842f4d621e155355/data/mixed/data.txt"
    )
    assert "../../../../tmp/data.txt" == file_.source
    assert file_.based_on is None
    assert file_.url.endswith(
        "/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/data/mixed/data.txt"
    )

    file_ = dataset.find_file("README.md")
    assert file_._id.endswith(
        "/blob/0bfb07be3b538e6683e1d2055b5ae4d3a4c593dd/README.md"
    )
    assert "README.md" == file_.source
    assert file_.based_on is None
    assert file_.url.endswith(
        "/projects/mohammad.alisafaee/old-datasets-v0.9.1/files/blob/README.md"
    )
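# Illustrative sketch only, not the actual ``_get_commit_sha_from_label``
# helper called in ``update_dataset_files`` above: the ``_label`` assertions
# in the test above show dataset-file labels of the form "<path>@<commit sha>"
# (e.g. "Makefile@49f331d7..."), so extracting the sha from such a label could
# be sketched like this (hypothetical name and behaviour):
def _sha_from_label_sketch(label):
    """Return the commit sha suffix of a "<path>@<sha>" label, or None."""
    if label and "@" in label:
        # Split on the right-most "@" so paths containing "@" still work.
        return label.rsplit("@", 1)[1]
    return None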
def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)
    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self._prepare_git_repo(url, ref)
    copied_sources = set()
    files = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(path, repo_path, sources, destination)

        if result:
            files.add(result)
            source = result[3]
            copied_sources.add(source)

    uncopied_sources = sources - copied_sources
    if uncopied_sources:
        uncopied_sources = {str(s) for s in uncopied_sources}
        raise errors.ParameterError(
            'No such file or directory', param_hint=uncopied_sources
        )

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _, __ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            path = str(src.resolve().relative_to(repo_path))
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    for path, src, dst, _ in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                creators = based_on.creator
            else:
                creators = []
                # grab all the creators from the commit history
                for commit in repo.iter_commits(paths=path):
                    creator = Person.from_commit(commit)
                    if creator not in creators:
                        creators.append(creator)

                based_on = DatasetFile.from_revision(
                    remote_client, path=path, url=url
                )

            path_in_dst_repo = dst.relative_to(self.path)

            results.append({
                'path': path_in_dst_repo,
                'url': remove_credentials(url),
                'creator': creators,
                'parent': self,
                'based_on': based_on
            })

            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(str(src), str(dst))

    return results
def _add_from_git(self, dataset, url, sources, destination, ref):
    """Process adding resources from another git repository."""
    from renku import LocalClient

    u = parse.urlparse(url)
    sources = self._resolve_paths(u.path, sources)

    # Get all files from repo that match sources
    repo, repo_path = self.prepare_git_repo(url, ref)
    files = set()
    used_sources = set()
    for file in repo.head.commit.tree.traverse():
        path = file.path
        result = self._get_src_and_dst(
            path, repo_path, sources, destination, used_sources
        )

        if result:
            files.add(result)

    unused_sources = set(sources.keys()) - used_sources
    if unused_sources:
        unused_sources = {str(s) for s in unused_sources}
        raise errors.ParameterError(
            "No such file or directory", param_hint=unused_sources
        )

    # Create metadata and move files to dataset
    results = []
    remote_client = LocalClient(repo_path)

    # Pull files from LFS
    paths = set()
    for path, src, _ in files:
        if src.is_dir():
            continue
        if src.is_symlink():
            try:
                path = str(src.resolve().relative_to(repo_path))
            except ValueError:  # External file
                pass
        paths.add(path)
    self._fetch_lfs_files(repo_path, paths)

    # Fetch metadata from Renku if any
    paths = {f[0] for f in files}
    metadata = self._fetch_files_metadata(remote_client, paths)

    new_files = []
    for path, src, dst in files:
        if not src.is_dir():
            # Use original metadata if it exists
            based_on = metadata.get(path)
            if based_on:
                based_on.url = url
                based_on.based_on = None
                based_on.source = url
            else:
                based_on = DatasetFile.from_revision(
                    remote_client, path=src, url=url, source=url
                )

            path_in_dst_repo = dst.relative_to(self.path)

            if path_in_dst_repo in new_files:
                # A path with the same destination is already copied
                continue

            new_files.append(path_in_dst_repo)

            if remote_client._is_external_file(src):
                operation = (src.resolve(), dst, "symlink")
            else:
                operation = (src, dst, "copy")

            results.append(
                {
                    "path": path_in_dst_repo,
                    "source": remove_credentials(url),
                    "parent": self,
                    "based_on": based_on,
                    "operation": operation,
                }
            )

    return results
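# Illustrative sketch only: the newer ``_add_from_git`` versions above return
# "operation" tuples of the form (src, dst, action) where action is either
# "copy" or "symlink".  A hypothetical consumer of one such tuple (not the
# actual renku code path that applies these operations) might look like this:
import shutil
from pathlib import Path


def _apply_operation_sketch(src, dst, action):
    """Materialize ``src`` at ``dst`` by copying or symlinking."""
    dst = Path(dst)
    dst.parent.mkdir(parents=True, exist_ok=True)
    if action == "symlink":
        # External files are linked rather than copied.
        dst.symlink_to(src)
    else:
        shutil.copy(str(src), str(dst))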