def _query_knowledge_graph(url):
    """Query the knowledge graph at ``url`` and return the decoded JSON body.

    :param url: knowledge-graph endpoint to query.
    :return: the response parsed as JSON.
    :raises errors.OperationError: if the request fails or the response
        status code is not 200.
    """
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException as e:
        # BUGFIX: ``requests.get`` raises exceptions from the
        # ``requests.exceptions`` hierarchy, never ``urllib.error.HTTPError``,
        # so the previous handler could not catch connection failures.
        raise errors.OperationError("Cannot access knowledge graph: {}".format(url)) from e

    if response.status_code != 200:
        raise errors.OperationError(
            "Cannot access knowledge graph: {}\nResponse code: {}".format(url, response.status_code)
        )
    return response.json()
def _add_from_url(self, dataset, url, destination, extract, progress=None):
    """Process an add from url and return the location on disk.

    :param dataset: dataset the files are added to.
    :param url: URL to download from.
    :param destination: local path to download to; if it is an existing
        directory the URL's basename is appended.
    :param extract: whether to extract downloaded archives.
    :param progress: progress-reporting class forwarded to ``_download``.
    :return: list of file-description dicts for the downloaded paths.
    :raises errors.OperationError: if the download fails.
    """
    if destination.exists() and destination.is_dir():
        u = parse.urlparse(url)
        destination = destination / Path(u.path).name

    try:
        paths = _download(url=url, download_to=destination, extract=extract, progress_class=progress)
    except error.HTTPError as e:  # pragma nocover
        raise errors.OperationError(
            'Cannot download from {}'.format(url)) from e

    # make the added file read-only
    for path in paths:
        mode = path.stat().st_mode & 0o777
        # BUGFIX: strip the *write* bits; the previous code cleared the
        # execute bits (S_IX*), which left the files writable instead of
        # read-only as the comment states.
        path.chmod(mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH))

    return [{
        'path': path.relative_to(self.path),
        'url': remove_credentials(url),
        'creator': dataset.creator,
        'parent': self
    } for path in paths]
def _add_from_url(self, dataset, url, destination, extract, filename=None, progress=None):
    """Process adding from url and return the location on disk."""
    url = self._provider_check(url)

    try:
        # Measure the download duration (milliseconds) so near-instant
        # responses can be detected below.
        started_at = time.time() * 1e3
        tmp_root, downloaded = self._download(url=url, filename=filename, extract=extract, progress_class=progress)
        elapsed_seconds = (time.time() * 1e3 - started_at) // 1e3

        # If execution time was less or equal to zero seconds,
        # block the thread a bit to avoid being rate limited.
        if elapsed_seconds == 0:
            time.sleep(min(os.cpu_count() - 1, 4) or 1)
    except (requests.exceptions.HTTPError, error.HTTPError) as e:  # pragma nocover
        raise errors.OperationError("Cannot download from {}".format(url)) from e

    # Map each downloaded file to its final location under ``destination``,
    # skipping directories, and emit a deferred "move" operation for each.
    results = []
    for src in downloaded:
        if src.is_dir():
            continue
        dst = destination / src.relative_to(tmp_root)
        results.append({
            "operation": (src, dst, "move"),
            "path": dst.relative_to(self.path),
            "source": remove_credentials(url),
            "parent": self,
        })
    return results
def _create_external_file(self, src, dst): """Create a new external file.""" try: pointer_file = self._create_pointer_file(target=src) relative = os.path.relpath(pointer_file, dst.parent) os.symlink(relative, dst) except OSError as e: raise errors.OperationError("Could not create symbolic link") from e
def _add_from_url(self, dataset, url, destination, extract, progress=None):
    """Process adding from url and return the location on disk.

    :param dataset: dataset the files are added to.
    :param url: URL to download from (validated via ``provider_check``).
    :param destination: local path to download to; if it is an existing
        directory the URL's basename is appended.
    :param extract: whether to extract downloaded archives.
    :param progress: progress-reporting class forwarded to ``_download``.
    :return: list of file-description dicts for the downloaded paths.
    :raises errors.OperationError: if the download fails.
    """
    if destination.exists() and destination.is_dir():
        u = parse.urlparse(url)
        destination = destination / Path(u.path).name
    else:
        destination.parent.mkdir(parents=True, exist_ok=True)

    url = self.provider_check(url)

    try:
        start = time.time() * 1e+3
        paths = _download(url=url, download_to=destination, extract=extract, progress_class=progress)
        exec_time = (time.time() * 1e+3 - start) // 1e+3

        # If execution time was less or equal to zero seconds,
        # block the thread a bit to avoid being rate limited.
        if exec_time == 0:
            time.sleep(min(os.cpu_count() - 1, 4) or 1)
    except (requests.exceptions.HTTPError, error.HTTPError) as e:  # pragma nocover
        raise errors.OperationError(
            'Cannot download from {}'.format(url)) from e

    # make the added file read-only
    for path in paths:
        mode = path.stat().st_mode & 0o777
        # BUGFIX: strip the *write* bits; the previous code cleared the
        # execute bits (S_IX*), which left the files writable instead of
        # read-only as the comment states.
        path.chmod(mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH))

    return [{
        'path': path.relative_to(self.path),
        'url': remove_credentials(url),
        'creator': dataset.creator,
        'parent': self
    } for path in paths]
def add_data_to_dataset(self, dataset, urls, force=False, sources=(), destination='', ref=None, link=False, external=False, extract=False, all_at_once=False, destination_names=None, progress=None):
    """Import the data into the data directory.

    Adds files from ``urls`` (remote Git repositories, remote URLs, or
    local paths) to ``dataset``, stages and commits them in the project
    repository, and records them as ``DatasetFile`` entries.

    :param dataset: dataset to add files to.
    :param urls: list of URLs or local paths to add.
    :param force: add files even if ignored by .gitignore or already
        contained in the dataset.
    :param sources: sources to add from a remote Git repository; only
        valid together with a remote Git URL.
    :param destination: path, relative to the dataset data directory,
        to add files under.
    :param ref: Git reference to use when adding from a Git repository.
    :param link: create hard links instead of copying local files.
    :param external: add local files as external files.
    :param extract: extract downloaded archives.
    :param all_at_once: process all ``urls`` in a single call (URLs only).
    :param destination_names: names for files added with ``all_at_once``.
    :param progress: progress-reporting class forwarded to the helpers.
    :return: a warning message string; empty when there is nothing to report.
    """
    warning_message = ''
    dataset_path = self.path / self.datadir / dataset.short_name

    # Resolve the destination inside the dataset's data directory.
    destination = destination or Path('.')
    destination = self._resolve_path(dataset_path, destination)
    destination = self.path / dataset_path / destination

    files = []
    if all_at_once:  # only for URLs
        files = self._add_from_urls(dataset=dataset, urls=urls, destination_names=destination_names, destination=destination, extract=extract, progress=progress)
    else:
        for url in urls:
            is_remote, is_git = _check_url(url)

            if is_git and is_remote:  # Remote git repo
                sources = sources or ()
                new_files = self._add_from_git(dataset, url, sources, destination, ref)
            else:
                if sources:
                    raise errors.UsageError('Cannot use "--source" with URLs or local files.')

                if not is_remote:  # Local path, might be git
                    if is_git:
                        warning_message = 'Adding data from local Git ' \
                            'repository. Use remote\'s Git URL instead ' \
                            'to enable lineage information and updates.'
                    u = parse.urlparse(url)
                    new_files = self._add_from_local(dataset, u.path, link, external, destination)
                else:  # Remote URL
                    new_files = self._add_from_url(dataset, url, destination, extract, progress=progress)

            files.extend(new_files)

    files_to_commit = {f['path'] for f in files if f['path']}
    # NOTE(review): ignored paths are computed even when ``force`` is set,
    # but only acted upon below when it is not.
    ignored = self.find_ignored_paths(*files_to_commit)

    if not force:
        if ignored:
            raise errors.IgnoredFiles(ignored)
        if dataset.contains_any(files):
            raise errors.DatasetFileExists()

    # all files at this point can be force-added and overwritten
    for data in files:
        # ``operation`` is a deferred (src, dst, action) triple produced by
        # some of the _add_from_* helpers; entries without it need no copy.
        operation = data.pop('operation', None)
        if not operation:
            continue
        src, dst, action = operation

        # Remove existing file if any
        self.remove_file(dst)
        dst.parent.mkdir(parents=True, exist_ok=True)

        if action == 'copy':
            shutil.copy(src, dst)
        elif action == 'link':
            try:
                os.link(src, dst)
            except Exception as e:
                raise errors.OperationError(
                    'Could not create hard link. Retry without "--link."'
                ) from e
        elif action == 'symlink':
            self._create_external_file(src, dst)
            data['external'] = True

    # Track non-symlinks in LFS
    self.track_paths_in_storage(*files_to_commit)

    # Force-add to include possible ignored files
    self.repo.git.add(*files_to_commit, force=True)
    self.repo.git.add(self.renku_pointers_path, force=True)

    staged_files = self.repo.index.diff('HEAD')
    if staged_files:
        msg = 'renku dataset: committing {} newly added files'.format(
            len(files_to_commit))
        self.repo.index.commit(msg)

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        # Skip anything named '.git' — it cannot live inside a Git repo.
        if os.path.basename(str(data['path'])) == '.git':
            continue
        dataset_file = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to root for submodules.
        if dataset_file.client != self:
            dataset_file.path = str(data['path'])
        dataset_files.append(dataset_file)

    dataset.update_files(dataset_files)
    return warning_message
def add_data_to_dataset(self, dataset, urls, force=False, overwrite=False, sources=(), destination='', ref=None, external=False, extract=False, all_at_once=False, destination_names=None, progress=None):
    """Import the data into the data directory.

    Adds files from ``urls`` (remote Git repositories, remote URLs, or
    local paths) to ``dataset``, stages and commits them in the project
    repository, and records them as ``DatasetFile`` entries.

    :param dataset: dataset to add files to.
    :param urls: list of URLs or local paths to add.
    :param force: add files even when a .gitignore would exclude them.
    :param overwrite: overwrite files the dataset already contains.
    :param sources: sources to add from a remote Git repository; only
        valid together with a remote Git URL.
    :param destination: path, relative to the dataset data directory,
        to add files under.
    :param ref: Git reference to use when adding from a Git repository.
    :param external: add local files as external (symlinked) files.
    :param extract: extract downloaded archives.
    :param all_at_once: process all ``urls`` in a single call (dataset import).
    :param destination_names: names for files added with ``all_at_once``.
    :param progress: progress-reporting class forwarded to the helpers.
    :return: tuple ``(warning_messages, messages)`` of user-facing strings.
    :raises errors.ParameterError: if the destination is not a directory.
    """
    messages = []
    warning_messages = []
    dataset_datadir = self.path / dataset.data_dir

    # Resolve the destination inside the dataset's data directory.
    destination = destination or Path('.')
    destination = self._resolve_path(dataset_datadir, destination)
    destination = self.path / dataset_datadir / destination

    if destination.exists() and not destination.is_dir():
        raise errors.ParameterError(
            f'Destination is not a directory: "{destination}"')

    self.check_external_storage()

    files = []
    if all_at_once:  # Importing a dataset
        files = self._add_from_urls(dataset=dataset, urls=urls, destination_names=destination_names, destination=destination, extract=extract, progress=progress)
    else:
        for url in urls:
            is_remote, is_git = _check_url(url)

            if is_git and is_remote:  # Remote git repo
                sources = sources or ()
                new_files = self._add_from_git(dataset=dataset, url=url, sources=sources, destination=destination, ref=ref)
            else:
                if sources:
                    raise errors.UsageError(
                        'Cannot use "--source" with URLs or local files.')

                if not is_remote:  # Local path, might be git
                    if is_git:
                        warning_messages.append(
                            'Adding data from local Git repository: ' +
                            'Use remote\'s Git URL instead to enable ' +
                            'lineage information and updates.')
                    u = parse.urlparse(url)
                    new_files = self._add_from_local(dataset=dataset, path=u.path, external=external, destination=destination)
                else:  # Remote URL
                    new_files = self._add_from_url(dataset=dataset, url=url, destination=destination, extract=extract, progress=progress)

            files.extend(new_files)

    # Remove all files that are under a .git directory
    paths_to_avoid = [
        f['path'] for f in files
        if '.git' in str(f['path']).split(os.path.sep)
    ]
    if paths_to_avoid:
        files = [f for f in files if f['path'] not in paths_to_avoid]
        warning_messages.append(
            'Ignored adding paths under a .git directory:\n ' +
            '\n '.join(str(p) for p in paths_to_avoid))

    files_to_commit = {str(self.path / f['path']) for f in files}

    if not force:
        ignored_files = self.find_ignored_paths(*files_to_commit)
        if ignored_files:
            ignored_files = set(ignored_files)
            files_to_commit = files_to_commit.difference(ignored_files)
            # Report the original sources (not the in-repo paths) for
            # entries that carry a deferred operation.
            ignored_sources = []
            for file_ in files:
                if str(self.path / file_['path']) in ignored_files:
                    operation = file_.get('operation')
                    if operation:
                        src, _, _ = operation
                        ignored_sources.append(src)
                    else:
                        ignored_sources.append(file_['path'])

            files = [
                f for f in files
                if str(self.path / f['path']) in files_to_commit
            ]
            # BUGFIX: user-facing typo — was "Theses paths are ignored".
            warning_messages.append(
                'These paths are ignored by one of your .gitignore ' +
                'files (use "--force" flag if you really want to add ' +
                'them):\n ' + '\n '.join([str(p) for p in ignored_sources]))

    # all files at this point can be force-added

    if not overwrite:
        existing_files = dataset.find_files(files_to_commit)
        if existing_files:
            files_to_commit = files_to_commit.difference(existing_files)
            files = [
                f for f in files
                if str(self.path / f['path']) in files_to_commit
            ]
            warning_messages.append(
                'These existing files were not overwritten ' +
                '(use "--overwrite" flag to overwrite them):\n ' +
                '\n '.join([str(p) for p in existing_files]))

    for data in files:
        # ``operation`` is a deferred (src, dst, action) triple produced by
        # the _add_from_* helpers; entries without it need no file action.
        operation = data.pop('operation', None)
        if not operation:
            continue
        src, dst, action = operation

        # Remove existing file if any
        self.remove_file(dst)
        dst.parent.mkdir(parents=True, exist_ok=True)

        if action == 'copy':
            shutil.copy(src, dst)
        elif action == 'move':
            shutil.move(src, dst, copy_function=shutil.copy)
        elif action == 'symlink':
            self._create_external_file(src, dst)
            data['external'] = True
        else:
            raise errors.OperationError(f'Invalid action {action}')

    # Track non-symlinks in LFS
    if self.check_external_storage():
        lfs_paths = self.track_paths_in_storage(*files_to_commit)
        show_message = self.get_value('renku', 'show_lfs_message')
        if (lfs_paths and (show_message is None or show_message == 'True')):
            messages.append(
                ('Adding these files to Git LFS:\n' +
                 '\t{}'.format('\n\t'.join(lfs_paths)) +
                 '\nTo disable this message in the future, run:' +
                 '\n\trenku config show_lfs_message False'))

    # Force-add to include possible ignored files
    self.repo.git.add(*files_to_commit, force=True)
    self.repo.git.add(self.renku_pointers_path, force=True)

    staged_files = self.repo.index.diff('HEAD')
    if staged_files:
        msg = 'renku dataset: committing {} newly added files'.format(
            len(files_to_commit))
        # Skip commit hooks when files are not tracked in external storage.
        skip_hooks = not self.external_storage_requested
        self.repo.index.commit(msg, skip_hooks=skip_hooks)
    else:
        warning_messages.append('No file was added to project')

    # Generate the DatasetFiles
    dataset_files = []
    for data in files:
        dataset_file = DatasetFile.from_revision(self, **data)

        # Set dataset file path relative to root for submodules.
        if dataset_file.client != self:
            dataset_file.path = str(data['path'])
        dataset_files.append(dataset_file)

    dataset.update_files(dataset_files)
    return warning_messages, messages
def _add_from_local(self, dataset, path, link, destination): """Add a file or directory from local filesystem.""" src = Path(path).resolve() if not src.exists(): raise errors.ParameterError( 'Cannot find file/directory: {}'.format(path)) if destination.exists() and destination.is_dir(): destination = destination / src.name # if we have a directory, recurse if src.is_dir(): if destination.exists() and not destination.is_dir(): raise errors.ParameterError('Cannot copy directory to a file') if src.name == '.git': # Cannot have a '.git' directory inside a Git repo return [] files = [] destination.mkdir(parents=True, exist_ok=True) for f in src.iterdir(): files.extend( self._add_from_local(dataset, f.absolute().as_posix(), link=link, destination=destination)) return files else: # Check if file is in the project and return it try: path_in_repo = src.relative_to(self.path) except ValueError: pass else: return [{ 'path': path_in_repo, 'url': path_in_repo, 'creator': dataset.creator, 'parent': self }] # Make sure the parent directory exists. destination.parent.mkdir(parents=True, exist_ok=True) if link: try: os.link(str(src), str(destination)) except Exception as e: raise errors.OperationError('Could not create hard link ' '- retry without --link.') from e else: shutil.copy(str(src), str(destination)) return [{ 'path': destination.relative_to(self.path), 'url': 'file://' + os.path.relpath(str(src), str(self.path)), 'creator': dataset.creator, 'parent': self }]