def _add_from_git(self, dataset, path, url, target, **kwargs):
    """Process adding resources from another git repository.

    The submodules are placed in ``.renku/vendors`` and linked
    to the *path* specified by the user.

    :param dataset: Dataset receiving the files; ``dataset.name`` is read.
    :param path: Directory below ``self.path`` in which links are created.
    :param url: Location of the source repository. A plain filesystem path
        or ``file://`` URL is treated as a local repository; ``http`` and
        ``https`` URLs are treated as remote ones.
    :param target: Path inside the source repository to link, or a falsy
        value to link the whole repository.
    :param relative_to: (keyword only, optional) Path that should be
        stripped from *target* when computing the link destination.
    :returns: A ``dict`` mapping the link path (relative to the dataset
        directory, POSIX style) to a ``DatasetFile``. For a directory the
        mappings of all entries below it are merged.
    :raises NotImplementedError: If the URL scheme is not supported.
    :raises AssertionError: If an absolute ``relative_to`` is combined with
        a remote URL.
    """
    local_schemes = ('', 'file')

    # create the submodule
    u = parse.urlparse(url)
    is_local = u.scheme in local_schemes
    submodule_path = self.renku_path / 'vendors' / (u.netloc or 'local')

    # Respect the directory structure inside the source path.
    relative_to = kwargs.get('relative_to', None)

    if is_local:
        warnings.warn('Importing local git repository, use HTTPS')

        # ``git.Repo`` takes a filesystem path, not a URL: a ``file://`` URL
        # must be reduced to its path component or the lookup fails.
        local_path = u.path if u.scheme == 'file' else url

        # determine where is the base repo path
        r = git.Repo(local_path, search_parent_directories=True)
        src_repo_path = Path(r.git_dir).parent
        submodule_name = os.path.basename(src_repo_path)
        submodule_path = submodule_path / str(src_repo_path).lstrip('/')

        # if repo path is a parent, rebase the paths and update url
        if src_repo_path != Path(u.path):
            top_target = Path(
                u.path
            ).resolve().absolute().relative_to(src_repo_path)
            target = top_target / target if target else top_target
            url = src_repo_path.as_posix()
    elif u.scheme in ('http', 'https'):
        submodule_name = os.path.splitext(os.path.basename(u.path))[0]
        submodule_path = submodule_path.joinpath(
            os.path.dirname(u.path).lstrip('/'), submodule_name
        )
    else:
        raise NotImplementedError(
            'Scheme {} not supported'.format(u.scheme)
        )

    # FIXME: do a proper check that the repos are not the same
    if submodule_name not in (s.name for s in self.git.submodules):
        # new submodule to add
        self.git.create_submodule(
            name=submodule_name, path=submodule_path.as_posix(), url=url
        )

    src = submodule_path / (target or '')

    if target and relative_to:
        relative_to = Path(relative_to)
        if relative_to.is_absolute():
            # Raised explicitly (instead of ``assert``) so the check is not
            # stripped under ``python -O``; the exception type is unchanged.
            if not is_local:
                raise AssertionError(
                    'Only relative paths can be used with URLs.'
                )
            # Use the repository root found above rather than ``Path(url)``,
            # which is not a valid filesystem path for ``file://`` URLs.
            target = (src_repo_path.resolve().absolute() /
                      target).relative_to(relative_to.resolve())
        else:
            # src already includes target so we do not have to append it
            target = src.relative_to(submodule_path / relative_to)

    # link the target into the data directory
    dst = self.path / path / submodule_name / (target or '')

    # if we have a directory, recurse
    if src.is_dir():
        files = {}
        dst.mkdir(parents=True, exist_ok=True)
        # FIXME get all files from submodule index
        for f in src.iterdir():
            try:
                files.update(
                    self._add_from_git(
                        dataset,
                        path,
                        url,
                        target=f.relative_to(submodule_path),
                        **kwargs
                    )
                )
            except ValueError:
                pass  # skip files outside the relative path
        return files

    # ``exist_ok`` avoids the race between an existence check and creation.
    dst.parent.mkdir(parents=True, exist_ok=True)
    os.symlink(os.path.relpath(src, dst.parent), dst)

    # grab all the authors from the commit history
    git_repo = git.Repo(submodule_path.absolute().as_posix())
    authors = []
    for commit in git_repo.iter_commits(paths=target):
        author = Author.from_commit(commit)
        if author not in authors:
            authors.append(author)

    dataset_path = self.path / self.datadir / dataset.name
    result = dst.relative_to(dataset_path).as_posix()

    # Local repositories have no meaningful remote location to record.
    if is_local:
        url = None
    else:
        url = '{}/{}'.format(url, target)

    return {
        result:
            DatasetFile(
                path=result,
                url=url,
                authors=authors,
                dataset=dataset.name,
                # TODO detect original dataset
            )
    }
def _add_from_git(self, dataset, path, url, target, **kwargs):
    """Process adding resources from another git repository.

    The submodules are placed in ``.renku/vendors`` and linked
    to the *path* specified by the user.

    :param dataset: Dataset receiving the files; ``dataset.name`` and
        ``dataset.creator`` are read.
    :param path: Directory below ``self.path`` in which links are created.
    :param url: Location of the source repository. A plain filesystem path
        or ``file://`` URL is treated as local; ``http``, ``https``,
        ``git+https`` and ``git+ssh`` URLs are treated as remote. A URL
        starting with ``git@`` is prefixed with ``git+ssh://``.
    :param target: Path inside the source repository to link, or a falsy
        value to link the whole repository.
    :param relative_to: (keyword only, optional) Path that should be
        stripped from *target* when computing the link destination.
    :returns: A ``list`` of ``dict`` records with the keys ``path``,
        ``url``, ``creator``, ``dataset`` and ``parent``. For a directory
        the records of all entries below it are concatenated.
    :raises NotImplementedError: If the URL scheme is not supported.
    :raises AssertionError: If an absolute ``relative_to`` is combined with
        a remote URL.
    """
    from git import Repo

    local_schemes = ('', 'file')
    remote_schemes = {'http', 'https', 'git+https', 'git+ssh'}

    # create the submodule
    if url.startswith('git@'):
        url = 'git+ssh://' + url

    u = parse.urlparse(url)
    is_local = u.scheme in local_schemes
    submodule_path = self.renku_path / 'vendors' / (u.netloc or 'local')

    # Respect the directory structure inside the source path.
    relative_to = kwargs.get('relative_to', None)

    if is_local:
        # ``Path`` and ``Repo`` take filesystem paths, not URLs: a
        # ``file://`` URL must be reduced to its path component.
        local_path = u.path if u.scheme == 'file' else url

        try:
            relative_url = Path(local_path).resolve().relative_to(self.path)
        except (ValueError, OSError, RuntimeError):
            # Not inside the project (or not resolvable at all).
            relative_url = None

        if relative_url is not None:
            # The source already lives inside this project: nothing to link.
            return [{
                'path': local_path,
                'url': url,
                'creator': dataset.creator,
                'dataset': dataset.name,
                'parent': self
            }]

        warnings.warn('Importing local git repository, use HTTPS')

        # determine where is the base repo path
        r = Repo(local_path, search_parent_directories=True)
        src_repo_path = Path(r.git_dir).parent.resolve()
        submodule_name = src_repo_path.name
        submodule_path = submodule_path / str(src_repo_path).lstrip('/')

        # if repo path is a parent, rebase the paths and update url
        if src_repo_path != Path(u.path):
            top_target = Path(
                u.path
            ).resolve().absolute().relative_to(src_repo_path)
            target = top_target / target if target else top_target
            url = src_repo_path.as_posix()
    elif u.scheme in remote_schemes:
        submodule_name = os.path.splitext(os.path.basename(u.path))[0]
        submodule_path = submodule_path.joinpath(
            os.path.dirname(u.path).lstrip('/'), submodule_name
        )
    else:
        raise NotImplementedError(
            'Scheme {} not supported'.format(u.scheme)
        )

    # FIXME: do a proper check that the repos are not the same
    if submodule_name not in (s.name for s in self.repo.submodules):
        # Keep the possibly rewritten URL separate from ``url``: the
        # original value is still needed for the recursive calls and for
        # the reported file URL below. A relative URL would parse with an
        # empty scheme and be mistaken for a local repository.
        submodule_url = url
        if u.scheme in remote_schemes:
            submodule_url = self.get_relative_url(url)

        # Submodule in python git does some custom magic that does not
        # allow for relative URLs, so we call the git function directly
        self.repo.git.submodule([
            'add', '--force', '--name', submodule_name, submodule_url,
            submodule_path.relative_to(self.path).as_posix()
        ])

    src = submodule_path / (target or '')

    if target and relative_to:
        relative_to = Path(relative_to)
        if relative_to.is_absolute():
            # Raised explicitly (instead of ``assert``) so the check is not
            # stripped under ``python -O``; the exception type is unchanged.
            if not is_local:
                raise AssertionError(
                    'Only relative paths can be used with URLs.'
                )
            # Use the repository root found above rather than ``Path(url)``,
            # which is not a valid filesystem path for ``file://`` URLs.
            target = (src_repo_path.resolve().absolute() /
                      target).relative_to(relative_to.resolve())
        else:
            # src already includes target so we do not have to append it
            target = src.relative_to(submodule_path / relative_to)

    # link the target into the data directory
    dst = self.path / path / (target or '')

    # if we have a directory, recurse
    if src.is_dir():
        files = []
        dst.mkdir(parents=True, exist_ok=True)
        # FIXME get all files from submodule index
        for f in src.iterdir():
            try:
                files.extend(
                    self._add_from_git(
                        dataset,
                        path,
                        url,
                        target=f.relative_to(submodule_path),
                        **kwargs
                    )
                )
            except ValueError:
                pass  # skip files outside the relative path
        return files

    # ``exist_ok`` avoids the race between an existence check and creation.
    dst.parent.mkdir(parents=True, exist_ok=True)
    os.symlink(os.path.relpath(str(src), str(dst.parent)), str(dst))

    # grab all the creators from the commit history
    git_repo = Repo(str(submodule_path.absolute()))
    creators = []
    for commit in git_repo.iter_commits(paths=target):
        creator = Creator.from_commit(commit)
        if creator not in creators:
            creators.append(creator)

    # Local repositories have no meaningful remote location to record.
    if is_local:
        url = None
    else:
        url = '{}/{}'.format(url, target)

    return [{
        'path': dst.relative_to(self.path),
        'url': url,
        'creator': creators,
        'dataset': dataset.name,
        'parent': self
    }]