def diff(repo: Repository, dev, master):
    """Display diff of DEV commit/branch to MASTER commit/branch.

    If no MASTER is specified, then the staging area branch HEAD will
    be used as the commit digest for MASTER. This operation will return
    a diff which could be interpreted as if you were merging the changes
    in DEV into MASTER.

    TODO: VERIFY ORDER OF OUTPUT IS CORRECT.
    """
    # deferred imports keep CLI startup fast; these pull in record machinery
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.commiting import get_staging_branch_head
    from hangar.records.summarize import status

    # DEV may be a branch name or a short commit digest; expand digests
    if dev not in repo.list_branches():
        dev = expand_short_commit_digest(repo._env.refenv, dev)

    if master is None:
        # default MASTER: the staging-area branch HEAD
        master = get_staging_branch_head(repo._env.branchenv)
    elif master not in repo.list_branches():
        master = expand_short_commit_digest(repo._env.refenv, master)

    diff_spec = repo.diff(master, dev)
    # ``status`` renders the diff into an in-memory text buffer
    buf = status(hashenv=repo._env.hashenv, branch_name=dev, diff=diff_spec.diff)
    click.echo(buf.getvalue())
def test_branch_create_and_delete(written_two_cmt_server_repo):
    # Clone a repository from the test server, then create and delete branches
    # through the CLI, checking stdout and branch listings after each step.
    server, base_repo = written_two_cmt_server_repo
    co = base_repo.checkout(write=True)
    cmt = co.commit_hash
    co.close()
    runner = CliRunner()
    with runner.isolated_filesystem():
        P = getcwd()
        new_repo = Repository(P, exists=False)
        try:
            res = runner.invoke(cli.clone, [
                '--name', 'Foo Tester', '--email', '*****@*****.**', f'{server}'
            ], obj=new_repo)
            assert res.exit_code == 0

            # create: CLI must report the new branch positioned at the clone HEAD
            res = runner.invoke(cli.branch_create, ['testbranch'], obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == f"Created BRANCH: testbranch HEAD: {cmt}\n"
            branches = new_repo.list_branches()
            assert branches == ['master', 'origin/master', 'testbranch']

            # delete: branch disappears from the listing again
            res = runner.invoke(cli.branch_remove, ['testbranch'], obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == f"Deleted BRANCH: testbranch HEAD: {cmt}\n"
            branches = new_repo.list_branches()
            assert branches == ['master', 'origin/master']

            # build a branch with an unmerged commit to exercise force-delete
            new_repo.create_branch('secondtest')
            co = new_repo.checkout(write=True, branch='secondtest')
            co.add_str_column('test_meta')
            newDigest = co.commit('dummy commit')
            co.close()
            # re-open with staging set to master so we can try to delete secondtest
            co = new_repo.checkout(write=True, branch='master')
            co.close()
            # removing an unmerged branch must fail without the force flag
            res = runner.invoke(cli.branch_remove, ['secondtest'], obj=new_repo)
            assert res.exit_code == 1
            res = runner.invoke(cli.branch_remove, ['secondtest', '-f'], obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == f"Deleted BRANCH: secondtest HEAD: {newDigest}\n"

            res = runner.invoke(cli.branch_list, obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == "['master', 'origin/master']\n"
        finally:
            # always release the cloned repo's environment handles
            new_repo._env._close_environments()
def test_cannot_operate_without_repo_init(managed_tmpdir):
    """Every public operation on an uninitialized repository raises RuntimeError."""
    repo = Repository(path=managed_tmpdir, exists=False)

    # method-style operations, in the same order the original exercised them
    failing_calls = [
        lambda: repo.writer_lock_held(),
        lambda: repo.checkout(),
        lambda: repo.writer_lock_held(),
        lambda: repo.log(),
        lambda: repo.summary(),
        lambda: repo.merge('fail', 'master', 'nonexistant'),
        lambda: repo.create_branch('test'),
        lambda: repo.list_branches(),
        lambda: repo.force_release_writer_lock(),
        lambda: repo.remote.add('origin', 'foo'),
        lambda: repo.remote.remove('origin'),
        lambda: repo.remote.fetch('origin', 'master'),
        lambda: repo.remote.fetch_data('origin', branch='master'),
        lambda: repo.remote.list_all(),
        lambda: repo.remote.ping('origin'),
        lambda: repo.remote.push('origin', 'master'),
        lambda: repo.remove_branch('master'),
    ]
    for operation in failing_calls:
        with pytest.raises(RuntimeError):
            operation()

    # property access must fail just like method calls do
    for attr_name in ('path', 'version', 'writer_lock_held', 'size_human', 'size_nbytes'):
        with pytest.raises(RuntimeError):
            getattr(repo, attr_name)

    assert repo._env.repo_is_initialized is False
def branch_create(ctx, name, startpoint):
    """Create a branch with NAME at STARTPOINT (short-digest or branch)

    If no STARTPOINT is provided, the new branch is positioned at the HEAD of
    the staging area branch, automatically.
    """
    from hangar.records.heads import get_branch_head_commit, get_staging_branch_head

    repo = Repository(path=os.getcwd())
    existing_branches = repo.list_branches()
    if name in existing_branches:
        raise ValueError(f'branch name: {name} already exists')

    # resolve STARTPOINT -> base commit: default staging HEAD, else branch
    # head, else expand a short commit digest
    if startpoint is None:
        staging_branch = get_staging_branch_head(repo._env.branchenv)
        base_commit = get_branch_head_commit(repo._env.branchenv, staging_branch)
    elif startpoint in existing_branches:
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    else:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)

    created_name = repo.create_branch(name, base_commit=base_commit)
    click.echo(f'BRANCH: ' + created_name + f' HEAD: {base_commit}')
def test_branch_create_and_list(written_two_cmt_server_repo):
    """Clone from the server, create a branch via the CLI, verify the listing.

    FIX: wrap the body in try/finally so the cloned repository's environment
    handles are always closed — the sibling ``test_branch_create_and_delete``
    already does this; previously an assertion failure here leaked the open
    environments for the rest of the test session.
    """
    server, base_repo = written_two_cmt_server_repo
    co = base_repo.checkout(write=True)
    cmt = co.commit_hash
    co.close()
    runner = CliRunner()
    with runner.isolated_filesystem():
        P = getcwd()
        new_repo = Repository(P, exists=False)
        try:
            res = runner.invoke(
                cli.clone,
                ['--name', 'Foo Tester', '--email', '*****@*****.**', f'{server}'],
                obj=new_repo)
            assert res.exit_code == 0

            # create: CLI reports the new branch at the clone HEAD commit
            res = runner.invoke(cli.branch_create, ['testbranch'], obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == f"Created BRANCH: testbranch HEAD: {cmt}\n"
            branches = new_repo.list_branches()
            assert branches == ['master', 'origin/master', 'testbranch']

            # list: CLI output matches Repository.list_branches()
            res = runner.invoke(cli.branch_list, obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == "['master', 'origin/master', 'testbranch']\n"
        finally:
            new_repo._env._close_environments()
def view_data(ctx, repo: Repository, column, sample, startpoint, format_, plugin):
    """Use a plugin to view the data of some SAMPLE in COLUMN at STARTPOINT.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit, get_staging_branch_head
    from hangar import external

    plugin_kwargs = parse_custom_arguments(ctx.args)

    # resolve STARTPOINT to a commit digest: branch name first, then short
    # digest, else fall back to the staging-area branch HEAD
    if startpoint in repo.list_branches():
        digest = get_branch_head_commit(repo._env.branchenv, startpoint)
    elif startpoint:
        digest = expand_short_commit_digest(repo._env.refenv, startpoint)
    else:
        staging_branch = get_staging_branch_head(repo._env.branchenv)
        digest = get_branch_head_commit(repo._env.branchenv, staging_branch)

    co = repo.checkout(commit=digest)
    try:
        aset = co.columns.get(column)
        extension = format_.lstrip('.') if format_ else None
        data = aset[sample]
        try:
            # hand the raw sample to the external plugin for display
            external.show(data, plugin=plugin, extension=extension, **plugin_kwargs)
        except Exception as e:
            raise click.ClickException(e)
    except KeyError as e:
        # unknown column or sample name
        raise click.ClickException(e)
    finally:
        co.close()
def branch_create(repo: Repository, name, startpoint):
    """Create a branch with NAME at STARTPOINT (short-digest or branch)

    If no STARTPOINT is provided, the new branch is positioned at the HEAD of
    the staging area branch, automatically.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit
    from hangar.records.heads import get_staging_branch_head

    existing_branches = repo.list_branches()
    if name in existing_branches:
        err = ValueError(f'branch name: {name} already exists')
        raise click.ClickException(err)

    try:
        # resolve STARTPOINT -> base commit digest
        if startpoint is None:
            staging_branch = get_staging_branch_head(repo._env.branchenv)
            base_commit = get_branch_head_commit(repo._env.branchenv, staging_branch)
        elif startpoint in existing_branches:
            base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
        else:
            base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)
        res = repo.create_branch(name, base_commit=base_commit)
    except (KeyError, ValueError, RuntimeError) as e:
        # surface any record-layer failure as a clean CLI error
        raise click.ClickException(e)

    click.echo(f'Created BRANCH: {res.name} HEAD: {res.digest}')
def fetch_data(repo: Repository, remote, startpoint, column, nbytes, all_):
    """Get data from REMOTE referenced by STARTPOINT (short-commit or branch).

    The default behavior is to only download a single commit's data or the
    HEAD commit of a branch. Please review optional arguments for other
    behaviors.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit
    from hangar.records.heads import get_staging_branch_head
    from hangar.utils import parse_bytes

    # resolve STARTPOINT -> full commit digest
    if startpoint is None:
        staging_branch = get_staging_branch_head(repo._env.branchenv)
        commit = get_branch_head_commit(repo._env.branchenv, staging_branch)
    elif startpoint in repo.list_branches():
        commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    else:
        commit = expand_short_commit_digest(repo._env.refenv, startpoint)
    click.echo(f'Fetching data for commit: {commit}')

    # an AttributeError from parse_bytes (e.g. nbytes not given) means "no limit"
    try:
        max_nbytes = parse_bytes(nbytes)
    except AttributeError:
        max_nbytes = None

    # an empty selection is passed as None — presumably "all columns"; verify
    if len(column) == 0:
        column = None

    commits = repo.remote.fetch_data(remote=remote,
                                     commit=commit,
                                     column_names=column,
                                     max_num_bytes=max_nbytes,
                                     retrieve_all_history=all_)
    click.echo(f'completed data for commits: {commits}')
def test_push_and_clone_master_linear_history_multiple_commits(
        server_instance, repo, managed_tmpdir, array5by7, nCommits, nSamples):
    from hangar import Repository
    from hangar.records.summarize import list_history

    # build ``nCommits`` commits of ``nSamples`` random arrays each on master
    cmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='writtenaset', shape=(5, 7), dtype=np.float32)
    for cIdx in range(nCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        sampList = []
        with co.columns['writtenaset'] as d:
            # drop all but the first sample carried over from the prior commit
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                sampList.append(arr)
        cmt = co.commit(f'commit number: {cIdx}')
        cmtList.append((cmt, sampList))
        co.close()
    masterHist = list_history(repo._env.refenv, repo._env.branchenv,
                              branch_name='master')

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'

    # clone into a fresh directory; records come over but sample data does not
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance,
                  remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']

    for cmt, sampList in cmtList:
        # checking out a commit with unfetched data warns the user
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'writtenaset' in nco.columns
        assert len(nco.columns['writtenaset']) == len(sampList)
        assert nco.columns['writtenaset'].contains_remote_references is True
        remoteKeys = nco.columns['writtenaset'].remote_reference_keys
        assert tuple([str(idx) for idx in range(len(sampList))]) == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            assert sIdx in nco.columns['writtenaset']
            # the data itself was never fetched, so access must raise
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.columns['writtenaset'][sIdx]
        nco.close()
    # cloned history must exactly match what was pushed
    cloneMasterHist = list_history(newRepo._env.refenv, newRepo._env.branchenv,
                                   branch_name='master')
    assert cloneMasterHist == masterHist
    newRepo._env._close_environments()
def test_push_clone_digests_exceeding_server_nbyte_limit(
        mocker, server_instance_nbytes_limit, repo, managed_tmpdir):
    from hangar import Repository
    from hangar.remote import chunks, client

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='aset', shape=(50, 50), dtype=np.float32)
    for cIdx in range(4):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.columns['aset'] as d:
            # drop all but the first sample carried over from the prior commit
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(70):
                arr = np.random.randint(0, 255, size=(50, 50)).astype(np.float32)
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance_nbytes_limit)
    # spy on the chunking iterator to verify the push is split into pieces
    spy = mocker.spy(chunks, 'tensorChunkedIterator')
    push1 = repo.remote.push('origin', 'master')
    assert chunks.tensorChunkedIterator.call_count == 6
    for call in spy.call_args_list:
        assert call[1][
            'uncomp_nbytes'] <= 550_000  # maximum amount over 100_000 observed in test development
    assert push1 == 'master'

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance_nbytes_limit,
                  remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']

    # spy on the client fetch path to count round trips under the size limit
    spy = mocker.spy(client.HangarClient, 'fetch_data')
    for cmt, sampList in masterCmtList:
        newRepo.remote.fetch_data('origin', commit=cmt)
        nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'aset' in nco.columns
        assert len(nco.columns['aset']) == 70
        for sIdx, samp in enumerate(sampList):
            assert np.allclose(nco.columns['aset'][str(sIdx)], samp)
        nco.close()
        del nco
    assert client.HangarClient.fetch_data.call_count == 8
    newRepo._env._close_environments()
def branch_list(ctx):
    """list all branch names

    Includes both remote branches as well as local branches.
    """
    # the command operates on the repository rooted at the current directory
    repo = Repository(path=os.getcwd())
    click.echo(repo.list_branches())
def test_starting_up_repo_warns_should_exist_manual_args(managed_tmpdir):
    """Opening a missing repo with ``exists=True`` warns; init then succeeds."""
    with pytest.warns(UserWarning):
        repo = Repository(path=managed_tmpdir, exists=True)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)

    assert repo.list_branches() == ['master']
    expected_path = os.path.join(managed_tmpdir, '.hangar')
    assert os.path.isdir(repo._repo_path)
    assert repo._repo_path == expected_path

    # a fresh writer checkout on a new repo reports a clean staging area
    wco = repo.checkout(write=True)
    assert wco.diff.status() == 'CLEAN'
    wco.close()
    repo._env._close_environments()
def export_data(ctx, repo: Repository, column, outdir, startpoint, sample,
                format_, plugin):
    """Export COLUMN sample data as it existed a STARTPOINT to some format and path.

    Specifying which sample to be exported is possible by using the switch
    ``--sample`` (without this, all the samples in the given column will be
    exported). Since hangar supports both int and str datatype for the sample
    name, specifying that while mentioning the sample name might be necessary
    at times. It is possible to do that by separating the name and type by a
    colon.

    Example:

        1. if the sample name is string of numeric 10 - ``str:10`` or ``10``
        2. if the sample name is ``sample1`` - ``str:sample1`` or ``sample1``
        3. if the sample name is an int, let say 10 - ``int:10``
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit, get_staging_branch_head
    from hangar import external

    kwargs = parse_custom_arguments(ctx.args)
    # resolve STARTPOINT: branch name > short digest > staging branch HEAD
    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    elif startpoint:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)
    else:
        branch_name = get_staging_branch_head(repo._env.branchenv)
        base_commit = get_branch_head_commit(repo._env.branchenv, branch_name)

    co = repo.checkout(commit=base_commit)
    try:
        aset = co.columns.get(column)
        # no --sample means export every sample in the column
        sampleNames = [sample] if sample is not None else list(aset.keys())
        extension = format_.lstrip('.') if format_ else None
        with aset, click.progressbar(sampleNames) as sNamesBar:
            for sampleN in sNamesBar:
                data = aset[sampleN]
                # encode the key's type so imports can round-trip int/str names
                formated_sampleN = f'{type(sampleN).__name__}:{sampleN}'
                try:
                    external.save(data, outdir, formated_sampleN, extension,
                                  plugin, **kwargs)
                except Exception as e:
                    raise click.ClickException(e)
    except KeyError as e:
        # unknown column or sample name
        raise click.ClickException(e)
    finally:
        co.close()
def test_push_clone_digests_exceeding_server_nbyte_limit(
        server_instance, repo, managed_tmpdir):
    from hangar.remote import config
    from hangar import Repository

    # shrink both transfer limits so a normal payload must be split up
    config.config['server']['grpc']['fetch_max_nbytes'] = 100_000
    config.config['client']['grpc']['push_max_nbytes'] = 100_000

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='aset', shape=(50, 20), dtype=np.float32)
    for cIdx in range(4):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.arraysets['aset'] as d:
            # drop all but the first sample carried over from the prior commit
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
            for sIdx in range(70):
                arr = np.random.randn(50, 20).astype(np.float32)
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance,
                  remove_old=True)
    assert newRepo.list_branches() == ['master', 'origin/master']
    for cmt, sampList in masterCmtList:
        # explicitly fetch each commit's data, then verify every sample
        newRepo.remote.fetch_data('origin', commit=cmt)
        nco = newRepo.checkout(commit=cmt)
        assert len(nco.arraysets) == 1
        assert 'aset' in nco.arraysets
        assert len(nco.arraysets['aset']) == 70
        for sIdx, samp in enumerate(sampList):
            assert np.allclose(nco.arraysets['aset'][str(sIdx)], samp)
        nco.close()
    newRepo._env._close_environments()
def test_push_restricted_with_right_username_password( server_instance_push_restricted, repo, managed_tmpdir): from hangar import Repository # Push master branch test masterCmtList = [] co = repo.checkout(write=True) co.add_ndarray_column(name='aset', shape=(50, 20), dtype=np.float32) for cIdx in range(1): if cIdx != 0: co = repo.checkout(write=True) masterSampList = [] with co.columns['aset'] as d: for prevKey in list(d.keys())[1:]: del d[prevKey] for sIdx in range(70): arr = np.random.randn(50, 20).astype(np.float32) d[str(sIdx)] = arr masterSampList.append(arr) cmt = co.commit(f'master commit number: {cIdx}') masterCmtList.append((cmt, masterSampList)) co.close() repo.remote.add('origin', server_instance_push_restricted) push1 = repo.remote.push('origin', 'master', username='******', password='******') assert push1 == 'master' # Clone test (master branch) new_tmpdir = pjoin(managed_tmpdir, 'new') mkdir(new_tmpdir) newRepo = Repository(path=new_tmpdir, exists=False) newRepo.clone('Test User', '*****@*****.**', server_instance_push_restricted, remove_old=True) assert newRepo.list_branches() == ['master', 'origin/master'] for cmt, sampList in masterCmtList: newRepo.remote.fetch_data('origin', commit=cmt) nco = newRepo.checkout(commit=cmt) assert len(nco.columns) == 1 assert 'aset' in nco.columns assert len(nco.columns['aset']) == 70 for sIdx, samp in enumerate(sampList): assert np.allclose(nco.columns['aset'][str(sIdx)], samp) nco.close() newRepo._env._close_environments()
def log(repo: Repository, startpoint):
    """Display commit graph starting at STARTPOINT (short-digest or name)

    If no argument is passed in, the staging area branch HEAD will be used as
    the starting point.
    """
    from hangar.records.commiting import expand_short_commit_digest

    # pick the appropriate Repository.log() invocation, then print once
    if startpoint is None:
        output = repo.log()
    elif startpoint in repo.list_branches():
        output = repo.log(branch=startpoint)
    else:
        digest = expand_short_commit_digest(repo._env.refenv, startpoint)
        output = repo.log(commit=digest)
    click.echo(output)
def log(ctx, startpoint):
    """Display commit graph starting at STARTPOINT (short-digest or name)

    If no argument is passed in, the staging area branch HEAD will be used as
    the starting point.
    """
    repo = Repository(path=os.getcwd())

    # choose the log source, then emit the rendered graph in one place
    if startpoint is None:
        rendered = repo.log()
    elif startpoint in repo.list_branches():
        rendered = repo.log(branch=startpoint)
    else:
        digest = expand_short_commit_digest(repo._env.refenv, startpoint)
        rendered = repo.log(commit=digest)
    click.echo(rendered)
def import_data(ctx, repo: Repository, column, path, branch, plugin, overwrite): """Import file or directory of files at PATH to COLUMN in the staging area. If passing in a directory, all files in the directory will be imported, if passing in a file, just that files specified will be imported """ # TODO: ignore warning through env variable from types import GeneratorType from hangar import external from hangar.records.heads import get_staging_branch_head kwargs = parse_custom_arguments(ctx.args) if branch is None: branch = get_staging_branch_head(repo._env.branchenv) elif branch not in repo.list_branches(): raise click.ClickException( f'Branch name: {branch} does not exist, Exiting.') click.echo(f'Writing to branch: {branch}') co = repo.checkout(write=True, branch=branch) try: active_aset = co.columns.get(column) p = Path(path) files = [f.resolve() for f in p.iterdir()] if p.is_dir() else [p.resolve()] with active_aset as aset, click.progressbar(files) as filesBar: for f in filesBar: ext = ''.join(f.suffixes).strip( '.') # multi-suffix files (tar.bz2) loaded = external.load(f, plugin=plugin, extension=ext, **kwargs) if not isinstance(loaded, GeneratorType): loaded = [loaded] for arr, fname in loaded: if (not overwrite) and (fname in aset): continue try: aset[fname] = arr except ValueError as e: click.echo(e) except (ValueError, KeyError) as e: raise click.ClickException(e) finally: co.close()
def summary(repo: Repository, startpoint):
    """Display content summary at STARTPOINT (short-digest or branch).

    If no argument is passed in, the staging area branch HEAD will be used as
    the starting point. In order to receive a machine readable, and more
    complete version of this information, please see the
    ``Repository.summary()`` method of the API.
    """
    from hangar.records.commiting import expand_short_commit_digest

    # dispatch on how STARTPOINT resolves, printing the summary exactly once
    if startpoint is None:
        rendered = repo.summary()
    elif startpoint in repo.list_branches():
        rendered = repo.summary(branch=startpoint)
    else:
        digest = expand_short_commit_digest(repo._env.refenv, startpoint)
        rendered = repo.summary(commit=digest)
    click.echo(rendered)
def import_data(repo: Repository, arrayset, path, branch, plugin, overwrite):
    """Import file(s) at PATH to ARRAYSET in the staging area.

    FIX: branch validation previously lived inside the try/finally whose
    ``finally`` calls ``co.close()``. When an unknown branch name triggered
    the early ``return None``, ``co`` had never been assigned and the finally
    clause raised UnboundLocalError, masking the friendly exit message. The
    branch is now resolved and the checkout opened *before* the try block.
    """
    from hangar.cli.io import imread
    from hangar.records.heads import get_staging_branch_head

    # resolve the target branch before any checkout is opened
    if branch is not None:
        if branch in repo.list_branches():
            branch_name = branch
        else:
            click.echo(f'Branch name: {branch} does not exist, Exiting.')
            return None
    else:
        branch_name = get_staging_branch_head(repo._env.branchenv)
    click.echo(f'Writing to branch: {branch_name}')

    with warnings.catch_warnings():
        # suppress UserWarnings emitted while opening the writer checkout
        warnings.simplefilter("ignore", UserWarning)
        co = repo.checkout(write=True, branch=branch_name)
    try:
        aset = co.arraysets.get(arrayset)
        if os.path.isfile(path):
            fname = os.path.basename(path)
            if not overwrite:
                if fname in aset:
                    click.echo(f'skipping existing name: {fname} as overwrite flag not set')
                    return None
            fNamePth = [(fname, path)]
        else:
            fnames = os.listdir(path)
            if not overwrite:
                # keep only names not already present in the arrayset
                fnames = [fname for fname in fnames if fname not in aset]
            fNamePth = [(fname, os.path.join(path, fname)) for fname in fnames]
        with aset as a, click.progressbar(fNamePth) as fnamesBar:
            for fn, fpth in fnamesBar:
                arr = imread(fpth, plugin=plugin)
                try:
                    a[fn] = arr
                except ValueError as e:
                    # report the mismatch but keep importing remaining files
                    click.echo(e)
    finally:
        co.close()
def fetch_data(ctx, remote, startpoint, aset, nbytes, all_):
    """Get data from REMOTE referenced by STARTPOINT (short-commit or branch).

    The default behavior is to only download a single commit's data or the
    HEAD commit of a branch. Please review optional arguments for other
    behaviors
    """
    from hangar.records.heads import get_branch_head_commit, get_staging_branch_head
    from hangar.utils import parse_bytes

    repo = Repository(path=os.getcwd())

    # resolve STARTPOINT -> commit digest, echoing how it was interpreted
    if startpoint is None:
        branch = get_staging_branch_head(repo._env.branchenv)
        commit = get_branch_head_commit(repo._env.branchenv, branch)
        click.echo(
            f'No startpoint supplied, fetching data of HEAD: {commit} for BRANCH: {branch}'
        )
    elif startpoint in repo.list_branches():
        commit = get_branch_head_commit(repo._env.branchenv, startpoint)
        click.echo(
            f'Fetching data for HEAD: {commit} of STARTPOINT BRANCH: {startpoint}'
        )
    else:
        commit = expand_short_commit_digest(repo._env.refenv, startpoint)
        click.echo(f'Fetching data for STARTPOINT HEAD: {commit}')

    click.echo(f'aset argument: {aset}')
    # AttributeError from parse_bytes (nbytes not provided) means "no limit"
    try:
        max_nbytes = parse_bytes(nbytes)
        click.echo(f'nbytes argument: {max_nbytes}')
    except AttributeError:
        max_nbytes = None

    # empty selection is passed through as None — presumably "all arraysets"
    if len(aset) == 0:
        aset = None

    commits = repo.remote.fetch_data(remote=remote,
                                     commit=commit,
                                     arrayset_names=aset,
                                     max_num_bytes=max_nbytes,
                                     retrieve_all_history=all_)
    click.echo(f'completed data for commits: {commits}')
def export_data(repo: Repository, startpoint, arrayset, out, sample, format_, plugin):
    """export ARRAYSET sample data as it existed a STARTPOINT to some format and path.

    FIX: ``co = repo.checkout(...)`` previously sat inside the try block whose
    ``finally`` calls ``co.close()``. If the checkout itself raised, the
    finally clause failed with NameError on the unbound ``co``, hiding the
    real error. The checkout is now opened before entering try/finally.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit
    from hangar.cli.io import imsave

    # resolve STARTPOINT: branch name, otherwise expand a short commit digest
    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    else:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)

    co = repo.checkout(write=False, commit=base_commit)
    try:
        aset_col = co.arraysets[arrayset]
        # no --sample given -> export every sample in the arrayset
        sampleNames = [sample] if sample else list(aset_col.keys())
        if format_:
            format_ = format_.lstrip('.')
        outP = os.path.expanduser(os.path.normpath(out))
        with aset_col as aset, click.progressbar(sampleNames) as sNamesBar:
            for sampleN in sNamesBar:
                # append the extension unless the name already carries it
                if format_ and not sampleN.endswith(format_):
                    outFP = os.path.join(outP, f'{sampleN}.{format_}')
                else:
                    outFP = os.path.join(outP, f'{sampleN}')
                try:
                    data = aset[sampleN]
                    imsave(outFP, data)
                except KeyError as e:
                    # report the missing sample and continue with the rest
                    click.echo(e)
    finally:
        co.close()
def view_data(repo: Repository, startpoint, arrayset, sample, plugin):
    """Use a plugin to view the data of some SAMPLE in ARRAYSET at STARTPOINT.

    FIX: the checkout was previously opened inside the try block whose
    ``finally`` calls ``co.close()``; a failed checkout therefore raised
    NameError on the unbound ``co``, masking the underlying error. The
    checkout now happens before entering try/finally.
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit
    from hangar.cli.io import imshow, show

    # resolve STARTPOINT: branch name, otherwise expand a short commit digest
    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    else:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)

    co = repo.checkout(write=False, commit=base_commit)
    try:
        aset = co.arraysets[arrayset]
        try:
            data = aset[sample]
            imshow(data, plugin=plugin)
            show()
        except KeyError as e:
            # unknown sample name — report rather than crash the CLI
            click.echo(e)
    finally:
        co.close()
class Hinterface(object):
    """
    Interface class to interact with hangar repositories. It enables the APIs
    to ignore the internals of hangar and can utilize the high level functions

    Fixes applied in review:
      * ``repo_details`` no longer mutates the commit ``order`` list via
        ``pop()`` — the old code consumed one entry before counting, so
        ``total_commit_count`` was one short whenever any commit existed.
      * ``create_repo`` now returns the initialized Repository so callers
        can actually use what was just built (previously it was discarded).
    """

    def __init__(self, path, branch='master', arrayset_name=None, sample_name=None):
        # ``path`` is expected to be a pathlib.Path (``.exists()`` is used)
        if not path.exists():
            raise FileNotFoundError("Repository does not exist")
        self.repo = Repository(path)
        # TODO: fix hangar's version compatibility check
        if not self.repo.initialized:
            raise RuntimeError("Repository not initialized")
        self.branch = branch
        self.arrayset_name = arrayset_name
        self.sample_name = sample_name
        # read checkout kept open for the lifetime of this interface object
        self.rcheckout = self.repo.checkout(branch=self.branch)

    @classmethod
    def create_repo(cls, path, username, email, desc=None, create_path=True):
        # TODO: Remove if it is not necessary
        if not path.exists() and create_path:
            path.mkdir()
        repo = Repository(path)
        repo.init(username, email)
        # return the initialized repo so the caller can work with it
        return repo

    @property
    def repo_details(self):
        cmt_details = self.repo.log(return_contents=True)
        # TODO: make sure the last entry of ``order`` is the latest commit
        order = cmt_details['order']
        if order:
            # read the newest entry WITHOUT mutating the list (pop() removed
            # it and made total_commit_count one short)
            cmt_time = cmt_details['specs'][order[-1]]['commit_time']
        else:
            cmt_time = None
        return {
            "last_commit_time": cmt_time,
            "total_commit_count": len(order),
            "branch_count": len(self.repo.list_branches()),
            "hangar_version": self.repo.version
        }

    @property
    def arraysets(self):
        # yield only the configured arrayset when one was named, else all
        if self.arrayset_name:
            yield self.rcheckout.arraysets[self.arrayset_name]
        else:
            for val in self.rcheckout.arraysets.values():
                yield val

    @property
    def sample_names(self):
        if self.arrayset_name:
            # NOTE(review): indexes the checkout directly rather than going
            # through ``.arraysets`` like the other accessors — confirm the
            # checkout supports item access before unifying.
            aset = self.rcheckout[self.arrayset_name]
            yield aset.name, list(aset.keys())
        else:
            for aset in self.rcheckout.arraysets.values():
                yield aset.name, list(aset.keys())

    def get_samples(self, plugin_name=None):
        # not yet implemented
        pass
def test_server_push_two_branch_then_clone_fetch_data_options(
        server_instance, repo, managed_tmpdir, array5by7, nMasterCommits,
        nMasterSamples, nDevCommits, nDevSamples, fetchBranch, fetchCommit,
        fetchAsetns, fetchNbytes, fetchAll_history):
    from hangar import Repository
    from hangar.records.summarize import list_history

    # Push master branch test
    masterCmts = {}
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='writtenaset', shape=(5, 7), dtype=np.float32)
    co.arraysets.init_arrayset(name='_two', shape=(20), dtype=np.float32)
    for cIdx in range(nMasterCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList1 = []
        masterSampList2 = []
        with co.arraysets['writtenaset'] as d, co.arraysets['_two'] as dd:
            # drop samples carried over from the prior commit (all but first)
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
                dd.remove(prevKey)
            for sIdx in range(nMasterSamples):
                arr1 = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr1
                masterSampList1.append(arr1)
                arr2 = np.random.randn(20).astype(np.float32)
                dd[str(sIdx)] = arr2
                masterSampList2.append(arr2)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmts[cmt] = (masterSampList1, masterSampList2)
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'
    masterHist = list_history(repo._env.refenv, repo._env.branchenv,
                              branch_name='master')

    # Push dev branch test
    devCmts = masterCmts.copy()
    branch = repo.create_branch('testbranch')
    for cIdx in range(nDevCommits):
        co = repo.checkout(write=True, branch=branch.name)
        devSampList1 = []
        devSampList2 = []
        with co.arraysets['writtenaset'] as d, co.arraysets['_two'] as dd:
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
                dd.remove(prevKey)
            for sIdx in range(nDevSamples):
                arr1 = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr1
                devSampList1.append(arr1)
                arr2 = np.random.randn(20).astype(np.float32)
                dd[str(sIdx)] = arr2
                devSampList2.append(arr2)
        cmt = co.commit(f'dev commit number: {cIdx}')
        devCmts[cmt] = (devSampList1, devSampList2)
        co.close()

    push2 = repo.remote.push('origin', branch.name)
    assert push2 == branch.name
    branchHist = list_history(repo._env.refenv, repo._env.branchenv,
                              branch_name=branch.name)

    # -------------------------- end setup ------------------------------------

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance,
                  remove_old=True)
    newRepo.remote.fetch('origin', branch=branch.name)
    newRepo.create_branch('testbranch', base_commit=branchHist['head'])
    assert newRepo.list_branches() == [
        'master', 'origin/master', f'origin/{branch.name}', branch.name
    ]

    # ------------------ format arguments depending on options ----------------

    kwargs = {
        'arrayset_names': fetchAsetns,
        'max_num_bytes': fetchNbytes,
        'retrieve_all_history': fetchAll_history,
    }
    if fetchBranch is not None:
        # fetch by branch name; pick the matching expected history
        func = branchHist if fetchBranch == 'testbranch' else masterHist
        kwargs['branch'] = fetchBranch
        kwargs['commit'] = None
    else:
        # fetch by explicit commit digest
        func = branchHist if fetchBranch == 'br' else masterHist
        kwargs['branch'] = None
        kwargs['commit'] = func['head']

    if fetchAll_history is True:
        commits_to_check = func['order']
    else:
        commits_to_check = [func['head']]

    # ----------------------- retrieve data with desired options --------------

    # This case should fail
    if (fetchAll_history is True) and isinstance(fetchNbytes, int):
        try:
            with pytest.raises(ValueError):
                fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
        finally:
            newRepo._env._close_environments()
        return True

    # get data
    fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
    assert commits_to_check == fetch_commits

    # ------------- check that you got everything you expected ----------------

    for fCmt in fetch_commits:
        co = newRepo.checkout(commit=fCmt)
        assert co.commit_hash == fCmt

        # when we are checking one aset only
        if isinstance(fetchAsetns, tuple):
            d = co.arraysets[fetchAsetns[0]]
            # ensure we didn't fetch the other data simultaneously
            ds1SampList, ds2SampList = devCmts[fCmt]
            if fetchAsetns[0] == 'writtenaset':
                compare = ds1SampList
            else:
                compare = ds2SampList

            totalSeen = 0
            for idx, samp in enumerate(compare):
                if fetchNbytes is None:
                    assert np.allclose(samp, d[str(idx)])
                else:
                    try:
                        arr = d[str(idx)]
                        assert np.allclose(samp, arr)
                        totalSeen += arr.nbytes
                    except FileNotFoundError:
                        pass
                    # fetched bytes must never exceed the requested cap
                    assert totalSeen <= fetchNbytes

        # compare both asets at the same time
        else:
            d = co.arraysets['writtenaset']
            dd = co.arraysets['_two']
            ds1List, ds2List = devCmts[fCmt]
            totalSeen = 0
            for idx, ds1ds2 in enumerate(zip(ds1List, ds2List)):
                ds1, ds2 = ds1ds2
                if fetchNbytes is None:
                    assert np.allclose(ds1, d[str(idx)])
                    assert np.allclose(ds2, dd[str(idx)])
                else:
                    try:
                        arr1 = d[str(idx)]
                        assert np.allclose(ds1, arr1)
                        totalSeen += arr1.nbytes
                    except FileNotFoundError:
                        pass
                    try:
                        arr2 = dd[str(idx)]
                        assert np.allclose(ds2, arr2)
                        totalSeen += arr2.nbytes
                    except FileNotFoundError:
                        pass
                    # fetched bytes must never exceed the requested cap
                    assert totalSeen <= fetchNbytes
        co.close()
    newRepo._env._close_environments()
def test_server_push_second_branch_with_new_commit_then_clone_partial_fetch(
        server_instance, repo, managed_tmpdir, array5by7, nMasterCommits,
        nMasterSamples, nDevCommits, nDevSamples):
    """End-to-end test: push master + a dev branch to a server, then clone.

    Verifies that a fresh clone receives only the record metadata (partial
    fetch): every sample key is present but the data itself is a remote
    reference, so reading a sample raises ``FileNotFoundError`` until the
    data is fetched.  Also verifies that fetching the dev branch adds the
    ``origin/<branch>`` ref and that commit histories round-trip unchanged.

    NOTE(review): fixture parameters (``nMasterCommits`` etc.) are assumed to
    be positive ints supplied by pytest parametrization — confirm in conftest.
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    # ---- Push master branch test ----
    masterCmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='writtenaset', shape=(5, 7), dtype=np.float32)
    for cIdx in range(nMasterCommits):
        # The first checkout is reused; subsequent iterations re-open staging.
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.columns['writtenaset'] as d:
            # Drop all but the first sample carried over from the prior
            # commit so each commit holds a fresh set of samples.
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nMasterSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    # push returns the branch name it pushed
    assert push1 == 'master'
    masterHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name='master')

    # ---- Push dev branch test ----
    devCmtList = []
    branch = repo.create_branch('testbranch')
    for cIdx in range(nDevCommits):
        co = repo.checkout(write=True, branch=branch.name)
        devSampList = []
        with co.columns['writtenaset'] as d:
            # Same pruning trick as above: keep one key, rewrite the rest.
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nDevSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                devSampList.append(arr)
        cmt = co.commit(f'dev commit number: {cIdx}')
        devCmtList.append((cmt, devSampList))
        co.close()
    push2 = repo.remote.push('origin', branch.name)
    assert push2 == branch.name
    branchHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name=branch.name)

    # ---- Clone test (master branch) ----
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)
    # Clone only brings master plus its remote tracking ref.
    assert newRepo.list_branches() == ['master', 'origin/master']

    for cmt, sampList in masterCmtList:
        # Checking out a commit with unfetched data emits a UserWarning.
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'writtenaset' in nco.columns
        assert len(nco.columns['writtenaset']) == nMasterSamples
        assert nco.columns['writtenaset'].contains_remote_references is True
        remoteKeys = nco.columns['writtenaset'].remote_reference_keys
        assert tuple([str(idx) for idx in range(len(sampList))]) == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            # Key exists in the record layer ...
            assert sIdx in nco.columns['writtenaset']
            # ... but the backing data was never fetched, so reads fail.
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.columns['writtenaset'][sIdx]
        nco.close()
    cloneMasterHist = list_history(newRepo._env.refenv, newRepo._env.branchenv, branch_name='master')
    assert cloneMasterHist == masterHist

    # ---- Fetch test (dev branch refs only, still no data) ----
    fetch = newRepo.remote.fetch('origin', branch=branch.name)
    assert fetch == f'origin/{branch.name}'
    assert newRepo.list_branches() == ['master', 'origin/master', f'origin/{branch.name}']

    for cmt, sampList in devCmtList:
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        assert len(nco.columns) == 1
        assert 'writtenaset' in nco.columns
        assert len(nco.columns['writtenaset']) == nDevSamples
        assert nco.columns['writtenaset'].contains_remote_references is True
        remoteKeys = nco.columns['writtenaset'].remote_reference_keys
        assert tuple([str(idx) for idx in range(len(sampList))]) == remoteKeys
        for idx, _ in enumerate(sampList):
            sIdx = str(idx)
            assert sIdx in nco.columns['writtenaset']
            with pytest.raises(FileNotFoundError):
                shouldNotExist = nco.columns['writtenaset'][sIdx]
        nco.close()
    cloneBranchHist = list_history(newRepo._env.refenv, newRepo._env.branchenv, branch_name=f'origin/{branch.name}')
    assert cloneBranchHist == branchHist
    newRepo._env._close_environments()
def branch_list(repo: Repository):
    """Print every branch name known to the repository.

    The listing covers local branches as well as remote tracking branches.
    """
    all_branches = repo.list_branches()
    click.echo(all_branches)
def test_server_push_two_branch_then_clone_fetch_data_options(
        self, two_branch_multi_commit_repo_class, managed_tmpdir_class,
        array5by7_class, fetchBranch, fetchCommit, fetchAsetns, fetchNbytes,
        fetchAll_history, tmp_path_factory):
    """Exercise ``remote.fetch_data`` across its option matrix after a clone.

    The class-scoped fixture provides a server already holding a master and a
    dev branch; this test clones, then fetches data selecting by branch vs
    commit, a subset of columns vs all, a byte budget vs unlimited, and full
    history vs head-only — and verifies exactly the expected data arrived.

    NOTE(review): ``devCmts`` is assumed to map commit digest -> 4 parallel
    sample lists (writtenaset, _two, str_col, bytes_col) — confirm in the
    fixture definition.
    """
    from hangar import Repository
    from operator import eq

    branch, branchHist, devCmts, masterHist, server_instance = two_branch_multi_commit_repo_class

    # Clone test (master branch)
    _new_tmpdir = tmp_path_factory.mktemp('newclone', numbered=True)
    new_tmpdir = str(_new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)
    newRepo.remote.fetch('origin', branch=branch.name)
    newRepo.create_branch('testbranch', base_commit=branchHist['head'])
    assert newRepo.list_branches() == ['master', 'origin/master', f'origin/{branch.name}', branch.name]

    # ------------------ format arguments depending on options -----------------

    kwargs = {
        'column_names': fetchAsetns,
        'max_num_bytes': fetchNbytes,
        'retrieve_all_history': fetchAll_history,
    }
    if fetchBranch is not None:
        # Selecting by branch name: pick the matching history fixture.
        func = branchHist if fetchBranch == 'testbranch' else masterHist
        kwargs['branch'] = fetchBranch
        kwargs['commit'] = None
    else:
        # Selecting by explicit commit digest (head of the chosen history).
        # NOTE(review): 'br' never equals None-branch param here, so this
        # always resolves to masterHist — presumably intentional; confirm.
        func = branchHist if fetchBranch == 'br' else masterHist
        kwargs['branch'] = None
        kwargs['commit'] = func['head']

    if fetchAll_history is True:
        commits_to_check = func['order']
    else:
        commits_to_check = [func['head']]

    # ----------------------- retrieve data with desired options --------------

    # This case should fail: a byte budget cannot be combined with
    # retrieve_all_history, so fetch_data must raise ValueError.
    if (fetchAll_history is True) and isinstance(fetchNbytes, int):
        try:
            with pytest.raises(ValueError):
                fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
        finally:
            # Always release LMDB environments, even on assertion failure.
            newRepo._env._close_environments()
        return True

    # get data
    fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
    assert commits_to_check == fetch_commits

    # ------------- check that you got everything you expected ----------------

    for fCmt in fetch_commits:
        co = newRepo.checkout(commit=fCmt)
        assert co.commit_hash == fCmt

        # when we are checking one aset only
        if isinstance(fetchAsetns, tuple):
            d = co.columns[fetchAsetns[0]]
            # ensure we didn't fetch the other data simultaneously
            ds1SampList, ds2SampList, ds3SampList, ds4SampList = devCmts[fCmt]
            # Array columns compare with np.allclose; str/bytes columns with eq.
            if fetchAsetns[0] == 'writtenaset':
                compare = ds1SampList
                cmp_func = np.allclose
            elif fetchAsetns[0] == '_two':
                compare = ds2SampList
                cmp_func = np.allclose
            elif fetchAsetns[0] == 'str_col':
                compare = ds3SampList
                cmp_func = eq
            else:
                compare = ds4SampList
                cmp_func = eq

            totalSeen = 0
            for idx, samp in enumerate(compare):
                if fetchNbytes is None:
                    # Unlimited fetch: every sample must be readable and equal.
                    assert cmp_func(samp, d[str(idx)])
                else:
                    # Byte-limited fetch: samples past the budget stay remote
                    # (FileNotFoundError); those present must still match.
                    try:
                        arr = d[str(idx)]
                        assert cmp_func(samp, arr)
                        try:
                            # ndarray has .nbytes; str/bytes fall back to len().
                            totalSeen += arr.nbytes
                        except AttributeError:
                            totalSeen += len(arr)
                    except FileNotFoundError:
                        pass
                    assert totalSeen <= fetchNbytes

        # compare both asets at the same time
        else:
            d = co.columns['writtenaset']
            dd = co.columns['_two']
            str_col = co.columns['str_col']
            bytes_col = co.columns['bytes_col']
            ds1List, ds2List, ds3List, ds4List = devCmts[fCmt]

            totalSeen = 0
            for idx, ds1ds2ds3ds4 in enumerate(zip(ds1List, ds2List, ds3List, ds4List)):
                ds1, ds2, ds3, ds4 = ds1ds2ds3ds4
                if fetchNbytes is None:
                    assert np.allclose(ds1, d[str(idx)])
                    assert np.allclose(ds2, dd[str(idx)])
                    assert ds3 == str_col[str(idx)]
                    assert ds4 == bytes_col[str(idx)]
                else:
                    # Each column independently may or may not have been
                    # fetched under the byte budget; accumulate what arrived.
                    try:
                        arr1 = d[str(idx)]
                        assert np.allclose(ds1, arr1)
                        totalSeen += arr1.nbytes
                    except FileNotFoundError:
                        pass
                    try:
                        arr2 = dd[str(idx)]
                        assert np.allclose(ds2, arr2)
                        totalSeen += arr2.nbytes
                    except FileNotFoundError:
                        pass
                    try:
                        sval = str_col[str(idx)]
                        assert ds3 == sval
                        totalSeen += len(sval.encode())
                    except FileNotFoundError:
                        pass
                    try:
                        bval = bytes_col[str(idx)]
                        assert ds4 == bval
                        totalSeen += len(bval)
                    except FileNotFoundError:
                        pass
                    assert totalSeen <= fetchNbytes
        co.close()
    newRepo._env._close_environments()