def test_get_ecosystem_details(managed_tmpdir):
    """The ecosystem report should be a dict listing host info and installed packages."""
    repo = Repository(path=managed_tmpdir, exists=False)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    try:
        eco = repo._ecosystem_details()
        assert isinstance(eco, dict)
        assert 'host' in eco
        assert 'packages' in eco
        # each entry in 'packages' is expected to be a (name, version) pair
        for package_name, version in eco['packages']:
            assert version is not None
    finally:
        # release the lmdb environments even if an assertion above fails,
        # matching the try/finally cleanup convention used by sibling tests
        repo._env._close_environments()
def clone(repo: Repository, remote, name, email, overwrite):
    """Initialize a repository at the current path and fetch updated records from REMOTE.

    Note: This method does not actually download the data to disk. Please look
    into the ``fetch-data`` command.
    """
    # refuse to touch an existing repo unless the caller asked to overwrite
    blocked = repo.initialized and not overwrite
    if blocked:
        click.echo(f'Repo already exists at: {repo.path}')
    else:
        repo.clone(name, email, remote, remove_old=overwrite)
def init(repo: Repository, name, email, overwrite, description=None):
    """Initialize an empty repository at the current path
    """
    # proceed when the repo does not exist yet, or when overwrite was requested
    if (not repo.initialized) or overwrite:
        repo.init(user_name=name,
                  user_email=email,
                  remove_old=overwrite,
                  description=description)
    else:
        click.echo(f'Repo already exists at: {repo.path}')
def init(ctx, name, email, overwrite):
    """Initialize an empty repository at the current path
    """
    cwd = os.getcwd()
    repo = Repository(path=cwd, exists=False)
    try:
        repo.init(user_name=name, user_email=email, remove_old=overwrite)
    except OSError as err:
        # most commonly: a repository already exists and overwrite was not set
        click.echo(err)
    else:
        click.echo(f'Hangar repository initialized at {cwd}')
def test_init_repo():
    """CLI ``init`` should create a repository that passes initialization checks."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        P = getcwd()
        repo = Repository(P, exists=False)
        try:
            res = runner.invoke(
                cli.init, ['--name', 'test', '--email', '*****@*****.**'], obj=repo)
            assert res.exit_code == 0
            # name-mangled private helper raises if the repo dir is invalid;
            # returning None means verification passed
            assert repo._Repository__verify_repo_initialized() is None
        finally:
            # close the lmdb environments so the isolated tmpdir can be
            # removed cleanly (previously the test leaked open environments)
            repo._env._close_environments()
def test_checkout_writer_branch_works(dummy_repo: Repository):
    """CLI ``checkout`` must move the staging head without holding the writer lock."""
    from hangar.records.heads import get_staging_branch_head

    dummy_repo.create_branch('dev')
    cli_runner = CliRunner()
    result = cli_runner.invoke(cli.checkout, ['dev'], obj=dummy_repo)
    assert result.exit_code == 0
    assert result.stdout == 'Writer checkout head set to branch: dev\n'
    # the staging branch recorded on disk must match the requested branch
    assert get_staging_branch_head(dummy_repo._env.branchenv) == 'dev'
    # the command must not leave the writer lock acquired
    assert dummy_repo.writer_lock_held is False
def clone(remote, uname, email, overwrite):
    # a multi-word user name arrives as separate CLI tokens; join them back
    uname = ' '.join(uname) if isinstance(uname, (list, tuple)) else uname
    cwd = os.getcwd()
    repo = Repository(path=cwd)
    repo.clone(user_name=uname,
               user_email=email,
               remote_address=remote,
               remove_old=overwrite)
    click.echo(f'Hangar repository initialized at {cwd}')
def test_fetch_records_and_data(server_instance, backend, options):
    """Push two branches to a server, clone into a fresh directory, then fetch
    records and (per the parametrized ``options``) data for the test branch.
    """
    runner = CliRunner()
    with runner.isolated_filesystem():
        repo = Repository(getcwd(), exists=False)
        repo.init('foo', 'bar')
        dummyData = np.arange(50)
        co1 = repo.checkout(write=True, branch='master')
        # legacy (pre-column) API: arraysets with named samples
        co1.arraysets.init_arrayset(
            name='dummy', prototype=dummyData, named_samples=True, backend=backend)
        for idx in range(10):
            dummyData[:] = idx
            co1.arraysets['dummy'][str(idx)] = dummyData
        co1.metadata['hello'] = 'world'
        co1.metadata['somemetadatakey'] = 'somemetadatavalue'
        cmt1 = co1.commit('first commit adding dummy data and hello meta')
        co1.close()

        # second branch diverges with non-conflicting samples/metadata
        repo.create_branch('testbranch')
        co2 = repo.checkout(write=True, branch='testbranch')
        for idx in range(10, 20):
            dummyData[:] = idx
            co2.arraysets['dummy'][str(idx)] = dummyData
        co2.metadata['foo'] = 'bar'
        cmt2 = co2.commit(
            'first commit on test branch adding non-conflict data and meta')
        co2.close()

        repo.remote.add('origin', server_instance)
        res = runner.invoke(cli.push, ['origin', 'master'], obj=repo)
        assert res.exit_code == 0
        res = runner.invoke(cli.push, ['origin', 'testbranch'], obj=repo)
        assert res.exit_code == 0

        # fresh directory: clone only fetches records for the default branch,
        # so testbranch records/data must be fetched explicitly below
        with runner.isolated_filesystem():
            repo = Repository(getcwd(), exists=False)
            res = runner.invoke(cli.clone, [
                '--name', 'Foo Tester', '--email', '*****@*****.**', f'{server_instance}'
            ], obj=repo)
            assert res.exit_code == 0
            res = runner.invoke(cli.fetch_records, ['origin', 'testbranch'], obj=repo)
            assert res.exit_code == 0
            res = runner.invoke(cli.branch_create, ['testbranch', 'origin/testbranch'], obj=repo)
            assert res.exit_code == 0
            res = runner.invoke(cli.fetch_data, options, obj=repo)
            assert res.exit_code == 0
class CheckoutCommit(object):
    """ASV benchmark: time read-only and write-enabled checkouts of a repo
    pre-populated with many samples and metadata records.
    """
    # asv parameter grid: one choices-tuple per name in `param_names`
    params = [(5_000, 20_000), (5_000, 20_000)]
    param_names = ['num_samples', 'num_metadata']
    processes = 2
    number = 1
    repeat = (2, 4, 20)
    warmup_time = 0

    def setup(self, num_samples, num_metadata):
        # build a throwaway repo with one uint8 column and commit everything
        self.tmpdir = mkdtemp()
        self.repo = Repository(path=self.tmpdir, exists=False)
        self.repo.init('tester', '*****@*****.**', remove_old=True)
        self.co = self.repo.checkout(write=True)
        arr = np.array([0, ], dtype=np.uint8)
        # the column-creation API changed across hangar versions; fall through
        # the historical spellings until one is accepted
        try:
            aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend_opts='10')
        except TypeError:
            aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend='10')
        except AttributeError:
            aset = self.co.add_ndarray_column('aset', prototype=arr, backend='10')
        with aset as cm_aset:
            for i in range(num_samples):
                arr[:] = i % 255
                cm_aset[i] = arr
        with self.co.metadata as cm_meta:
            for i in range(num_metadata):
                cm_meta[i] = f'{i % 500} data'
        self.co.commit('first')
        self.co.close()
        # cleared so teardown only closes checkouts opened by the timed methods
        self.co = None

    def teardown(self, num_samples, num_metadata):
        # NOTE(review): if no timed method ran, self.co is None and .close()
        # raises AttributeError, which PermissionError does not catch -- confirm
        # asv always runs a timed method before teardown
        try:
            self.co.close()
        except PermissionError:
            pass
        self.repo._env._close_environments()
        rmtree(self.tmpdir)

    def time_checkout_read_only(self, num_samples, num_metadata):
        self.co = self.repo.checkout(write=False)

    def time_checkout_write_enabled(self, num_samples, num_metadata):
        # closing inside the timed region includes writer-lock release in the
        # measurement and frees the lock for the next repeat
        self.co = self.repo.checkout(write=True)
        self.co.close()
def test_init_repo(managed_tmpdir):
    """CLI ``init`` should create a repository that passes initialization checks."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        P = getcwd()
        # construct the handle BEFORE entering the try block: if construction
        # itself raised, `repo` would be unbound and the finally clause would
        # fail with NameError, masking the real error
        repo = Repository(P, exists=False)
        try:
            res = runner.invoke(
                cli.init, ['--name', 'test', '--email', '*****@*****.**'], obj=repo)
            assert res.exit_code == 0
            # name-mangled private helper returns None when the repo is valid
            assert repo._Repository__verify_repo_initialized() is None
        finally:
            repo._env._close_environments()
def test_summary_before_commit_made(managed_tmpdir):
    """``summary`` on a repo with no commits should report that plainly."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        P = getcwd()
        new_repo = Repository(P, exists=False)
        new_repo.init('Test User', '*****@*****.**')
        try:
            res = runner.invoke(cli.summary, obj=new_repo)
            assert res.exit_code == 0
            assert 'No commits have been made in the repository' in res.stdout
        finally:
            # BUG FIX: the original referenced the bound method without calling
            # it (`..._close_environments` with no parentheses), so the lmdb
            # environments were never actually closed
            new_repo._env._close_environments()
def setup(self):
    # Attributes expected to be provided by the concrete benchmark subclass:
    #   self.method       -- 'read' populates+commits data and reopens read-only;
    #                        any other value keeps a write checkout open
    #   self.num_samples  -- number of samples written when method == 'read'
    #   self.sample_shape -- per-dimension sizes of the benchmark sample
    self.current_iter_number = 0
    self.tmpdir = mkdtemp()
    self.repo = Repository(path=self.tmpdir, exists=False)
    self.repo.init('tester', '*****@*****.**', remove_old=True)
    self.co = self.repo.checkout(write=True)

    # Build a separable sample: one 1-D hamming window per axis, each reshaped
    # so it broadcasts along exactly that axis of the target shape.
    component_arrays = []
    ndims = len(self.sample_shape)
    for idx, shape in enumerate(self.sample_shape):
        layout = [1 for i in range(ndims)]
        layout[idx] = shape
        component = np.hamming(shape).reshape(*layout) * 100
        component_arrays.append(component.astype(np.float32))
    # NOTE(review): np.prod over a list of differently-shaped arrays relies on
    # broadcasting multiplication to yield the full outer-product array --
    # confirm this behaves identically on all numpy versions benchmarked
    arr = np.prod(component_arrays).astype(np.float32)

    # Column-creation API changed across hangar versions; try each historical
    # spelling. ValueError means the '01' backend does not exist on this
    # commit, which asv interprets (via NotImplementedError) as "skip".
    try:
        self.aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend_opts='01')
    except TypeError:
        try:
            self.aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend='01')
        except ValueError:
            raise NotImplementedError
    except ValueError:
        # marks as skipped benchmark for commits which do not have this backend.
        raise NotImplementedError
    except AttributeError:
        self.aset = self.co.add_ndarray_column('aset', prototype=arr, backend='01')

    if self.method == 'read':
        # write num_samples slightly-different samples, commit, and reopen
        # read-only so the timed methods measure read paths
        with self.aset as cm_aset:
            for i in range(self.num_samples):
                arr[0, 0, 0] += 1
                cm_aset[i] = arr
        self.co.commit('first commit')
        self.co.close()
        self.co = self.repo.checkout(write=False)
        # newer versions expose `columns`; older ones only `arraysets`
        try:
            self.aset = self.co.columns['aset']
        except AttributeError:
            self.aset = self.co.arraysets['aset']
    else:
        # write benchmarks reuse the prototype array during timing
        self.arr = arr
def test_initial_arrayset(managed_tmpdir, randomsizedarray):
    """A brand-new write checkout starts with zero columns; adding one works."""
    repo = Repository(path=managed_tmpdir, exists=False)
    repo.init(user_name='tester', user_email='*****@*****.**', remove_old=True)
    wco = repo.checkout(write=True)
    assert len(wco.columns) == 0
    # accessing a column that was never created must raise
    with pytest.raises(KeyError):
        wco.columns['aset']
    created = wco.add_ndarray_column('aset', prototype=randomsizedarray)
    assert created.column == 'aset'
    wco.close()
    repo._env._close_environments()
def test_local_without_data_fails_data_unavailable(self, written_two_cmt_server_repo, managed_tmpdir):
    """A records-only clone has no sample data, so building a tf dataset fails."""
    clone_dir = pjoin(managed_tmpdir, 'new')
    mkdir(clone_dir)
    server, _ = written_two_cmt_server_repo
    repo = Repository(path=clone_dir, exists=False)
    repo.clone('name', '[email protected]', server, remove_old=True)
    co = repo.checkout()
    column = co.arraysets['writtenaset']
    # data was never fetched, only records -- reading samples must fail
    with pytest.raises(FileNotFoundError):
        make_tf_dataset(column, keys=['1', '2'])
    co.close()
    repo._env._close_environments()
def test_check_repository_software_version_startup(managed_tmpdir):
    """Reopening a freshly created repo should report the current base version."""
    from hangar import Repository, __version__
    from pkg_resources import parse_version

    first = Repository(managed_tmpdir, exists=False)
    first.init('test user', '*****@*****.**', remove_old=True)
    first._env._close_environments()

    reopened = Repository(managed_tmpdir, exists=True)
    assert reopened.initialized is True
    # the version stamped on disk must match the running software's base version
    assert reopened.version == parse_version(__version__).base_version
    reopened._env._close_environments()
def test_local_without_data_fails_no_common_no_local(self, written_two_cmt_server_repo, managed_tmpdir):
    """A records-only clone has no sample data, so building a torch dataset fails."""
    clone_dir = pjoin(managed_tmpdir, 'new')
    mkdir(clone_dir)
    server, _ = written_two_cmt_server_repo
    repo = Repository(path=clone_dir, exists=False)
    repo.clone('name', '[email protected]', server, remove_old=True)
    co = repo.checkout()
    column = co.arraysets['writtenaset']
    # no sample data exists locally, so dataset construction must fail
    with pytest.raises(ValueError):
        make_torch_dataset(column)
    co.close()
    repo._env._close_environments()
def clone(ctx, remote, name, email, overwrite):
    """Initialize a repository at the current path and fetch updated records from REMOTE.

    Note: This method does not actually download the data to disk. Please look
    into the ``fetch-data`` command.
    """
    cwd = os.getcwd()
    repo = Repository(path=cwd, exists=False)
    repo.clone(user_name=name,
               user_email=email,
               remote_address=remote,
               remove_old=overwrite)
    click.echo(f'Hangar repository initialized at {cwd}')
def test_push_fetch_records(server_instance, backend):
    """Pushing two divergent branches from a fresh repo should both succeed."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        repo = Repository(getcwd(), exists=False)
        try:
            repo.init('foo', 'bar')
            dummyData = np.arange(50)
            co1 = repo.checkout(write=True, branch='master')
            co1.add_ndarray_column(name='dummy', prototype=dummyData, backend=backend)
            for idx in range(10):
                dummyData[:] = idx
                co1.columns['dummy'][str(idx)] = dummyData
            cmt1 = co1.commit('first commit adding dummy data')
            co1.close()

            # second branch adds non-overlapping sample keys (no conflicts)
            repo.create_branch('testbranch')
            co2 = repo.checkout(write=True, branch='testbranch')
            for idx in range(10, 20):
                dummyData[:] = idx
                co2.columns['dummy'][str(idx)] = dummyData
            cmt2 = co2.commit(
                'first commit on test branch adding non-conflict data')
            co2.close()

            repo.remote.add('origin', server_instance)
            res = runner.invoke(cli.push, ['origin', 'master'], obj=repo)
            assert res.exit_code == 0
            res = runner.invoke(cli.push, ['origin', 'testbranch'], obj=repo)
            assert res.exit_code == 0
        finally:
            # always release lmdb environments so the tmpdir can be removed
            repo._env._close_environments()
def liberate():
    """ Release the writer lock forcefully and make the repository available for writing.

    Warning
    -------
    If another process, that has the writer lock, is writing to the repo,
    releasing the lock leads to an exception in that process. Use it carefully
    """
    repo = Repository(Path.cwd(), exists=True)
    released = repo.force_release_writer_lock()
    if released:
        click.echo("Writer lock released")
    else:
        click.echo("Error while attempting to release the writer lock")
def init(uname, email, overwrite):
    # Initialize an empty hangar repository in the current working directory.
    # (kept as a comment rather than a docstring so the click help text is
    # unchanged for existing users)
    P = os.getcwd()
    # a multi-word user name arrives as separate CLI tokens; join them back
    if isinstance(uname, (list, tuple)):
        uname = ' '.join(uname)
    repo = Repository(path=P)
    if overwrite:
        # remove_old=True replaces any existing repo, so "already exists"
        # cannot be raised here and no handler is needed
        repo.init(user_name=uname, user_email=email, remove_old=True)
    else:
        try:
            repo.init(user_name=uname, user_email=email, remove_old=False)
        except OSError as e:
            # repo already exists; show the message instead of a traceback.
            # (the previously-unused `repoDir` return-value bindings were removed)
            click.echo(e)
def test_check_repository_software_version_fails_on_older_repo(managed_tmpdir):
    """Opening a repo stamped with an old software version must be rejected."""
    from hangar import Repository
    from hangar.records.vcompat import set_repository_software_version

    repo = Repository(managed_tmpdir, exists=False)
    repo.init('test user', '*****@*****.**', remove_old=True)
    # stamp an outdated version string; the next open must refuse the repo
    set_repository_software_version(repo._env.branchenv, '0.2.0', overwrite=True)
    repo._env._close_environments()

    with pytest.raises(RuntimeError):
        Repository(managed_tmpdir, exists=True)
def export_data(ctx, repo: Repository, column, outdir, startpoint, sample, format_, plugin):
    """Export COLUMN sample data as it existed a STARTPOINT to some format and path.

    Specifying which sample to be exported is possible by using the switch
    ``--sample`` (without this, all the samples in the given column will be
    exported). Since hangar supports both int and str datatype for the sample
    name, specifying that while mentioning the sample name might be necessary
    at times. It is possible to do that by separating the name and type by a
    colon.

    Example:

       1. if the sample name is string of numeric 10 - ``str:10`` or ``10``
       2. if the sample name is ``sample1`` - ``str:sample1`` or ``sample1``
       3. if the sample name is an int, let say 10 - ``int:10``
    """
    from hangar.records.commiting import expand_short_commit_digest
    from hangar.records.heads import get_branch_head_commit, get_staging_branch_head
    from hangar import external

    # extra `--key value` CLI args are forwarded to the export plugin
    kwargs = parse_custom_arguments(ctx.args)
    # resolve STARTPOINT: branch name, then short digest, then staging HEAD
    if startpoint in repo.list_branches():
        base_commit = get_branch_head_commit(repo._env.branchenv, startpoint)
    elif startpoint:
        base_commit = expand_short_commit_digest(repo._env.refenv, startpoint)
    else:
        branch_name = get_staging_branch_head(repo._env.branchenv)
        base_commit = get_branch_head_commit(repo._env.branchenv, branch_name)

    co = repo.checkout(commit=base_commit)
    try:
        aset = co.columns.get(column)
        # no --sample switch means export every sample in the column
        sampleNames = [sample] if sample is not None else list(aset.keys())
        extension = format_.lstrip('.') if format_ else None
        with aset, click.progressbar(sampleNames) as sNamesBar:
            for sampleN in sNamesBar:
                data = aset[sampleN]
                # encode the python type into the name so import can
                # round-trip int vs str sample keys
                formated_sampleN = f'{type(sampleN).__name__}:{sampleN}'
                try:
                    external.save(data, outdir, formated_sampleN, extension, plugin, **kwargs)
                except Exception as e:
                    raise click.ClickException(e)
    except KeyError as e:
        raise click.ClickException(e)
    finally:
        # always release the read checkout
        co.close()
def writer_lock_held(repo: Repository, force_release_):
    """Determine if the writer lock is held for a repository.

    Passing the ``--force-release`` flag will instantly release the writer
    lock, invalidating any process which currently holds it.
    """
    # idiom fix: the original used f-strings with no placeholders (needless
    # f-prefix, ruff F541); the emitted text is byte-identical
    if force_release_:
        repo.force_release_writer_lock()
        click.echo('Success force release of writer lock.')
    elif repo.writer_lock_held:
        click.echo('Writer lock is held.')
    else:
        click.echo('Writer lock is available.')
def test_server_fetch_data_sample(
        self, two_multi_format_repo_class, managed_tmpdir_class, fetchOp,
        column_name, keys, tmp_path_factory
):
    """Fetch sample-level data from a remote (by branch or by commit) and
    verify every requested key is materialized in a fresh clone.
    """
    from hangar import Repository
    cmt, server_instance = two_multi_format_repo_class

    # Clone test (master branch)
    _new_tmpdir = tmp_path_factory.mktemp('newclone', numbered=True)
    new_tmpdir = str(_new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

    # ------------------ format arguments depending on options -----------------

    kwargs = {
        'column': column_name,
        'samples': keys
    }
    if fetchOp == 'branch':
        kwargs['branch'] = 'master'
    elif fetchOp == 'commit':
        kwargs['commit'] = cmt
    else:
        raise ValueError(f'fetchOp unknown: {fetchOp}')

    fetch_commit = newRepo.remote.fetch_data_sample(remote='origin', **kwargs)
    # the fetch must resolve to the expected commit digest regardless of how
    # the startpoint was specified
    assert fetch_commit == cmt

    co = newRepo.checkout()
    try:
        col = co[column_name]
        if isinstance(keys, (list, tuple)):
            if column_name.endswith('flat'):
                # flat columns: each key maps directly to one sample
                for key in keys:
                    assert col[key] is not None
            else:
                # nested columns: a key may be (sample, subsample), a
                # one-tuple (sample,), or a bare sample name; Ellipsis
                # indexing pulls every subsample of a sample
                for sample in keys:
                    if isinstance(sample, (list, tuple)):
                        if len(sample) == 2:
                            assert col[sample[0]][sample[1]] is not None
                        elif len(sample) == 1:
                            assert col[sample[0]][...] is not None
                    else:
                        assert col[sample][...] is not None
    finally:
        co.close()
    newRepo._env._close_environments()
def __init__(self, path, branch='master', arrayset_name=None, sample_name=None):
    """Open a read checkout of the hangar repository located at ``path``."""
    if not path.exists():
        raise FileNotFoundError("Repository does not exist")
    self.branch = branch
    self.arrayset_name = arrayset_name
    self.sample_name = sample_name
    self.repo = Repository(path)
    # TODO: fix hangar's version compatibility check
    if not self.repo.initialized:
        raise RuntimeError("Repository not initialized")
    self.rcheckout = self.repo.checkout(branch=self.branch)
def log(repo: Repository, startpoint):
    """Display commit graph starting at STARTPOINT (short-digest or name)

    If no argument is passed in, the staging area branch HEAD will be used as the
    starting point.
    """
    from hangar.records.commiting import expand_short_commit_digest

    # resolve the startpoint: default HEAD, branch name, or short digest
    if startpoint is None:
        graph = repo.log()
    elif startpoint in repo.list_branches():
        graph = repo.log(branch=startpoint)
    else:
        digest = expand_short_commit_digest(repo._env.refenv, startpoint)
        graph = repo.log(commit=digest)
    click.echo(graph)
def test_check_repository_software_version_works_on_newer_hangar_version(managed_tmpdir, monkeypatch, futureVersion):
    """A repo written by an older hangar must still open under a newer version."""
    from hangar import Repository

    repo = Repository(managed_tmpdir, exists=False)
    repo.init('test user', '*****@*****.**', remove_old=True)
    recorded_version = repo.version
    repo._env._close_environments()

    # pretend the running software is a future release
    import hangar
    monkeypatch.setattr(hangar, '__version__', futureVersion)

    reopened = Repository(managed_tmpdir, exists=True)
    assert hangar.__version__ == futureVersion
    # the on-disk version stamp is unchanged by merely reopening
    assert reopened.version == recorded_version
    reopened._env._close_environments()
def import_data(ctx, repo: Repository, column, path, branch, plugin, overwrite): """Import file or directory of files at PATH to COLUMN in the staging area. If passing in a directory, all files in the directory will be imported, if passing in a file, just that files specified will be imported """ # TODO: ignore warning through env variable from types import GeneratorType from hangar import external from hangar.records.heads import get_staging_branch_head kwargs = parse_custom_arguments(ctx.args) if branch is None: branch = get_staging_branch_head(repo._env.branchenv) elif branch not in repo.list_branches(): raise click.ClickException( f'Branch name: {branch} does not exist, Exiting.') click.echo(f'Writing to branch: {branch}') co = repo.checkout(write=True, branch=branch) try: active_aset = co.columns.get(column) p = Path(path) files = [f.resolve() for f in p.iterdir()] if p.is_dir() else [p.resolve()] with active_aset as aset, click.progressbar(files) as filesBar: for f in filesBar: ext = ''.join(f.suffixes).strip( '.') # multi-suffix files (tar.bz2) loaded = external.load(f, plugin=plugin, extension=ext, **kwargs) if not isinstance(loaded, GeneratorType): loaded = [loaded] for arr, fname in loaded: if (not overwrite) and (fname in aset): continue try: aset[fname] = arr except ValueError as e: click.echo(e) except (ValueError, KeyError) as e: raise click.ClickException(e) finally: co.close()
def test_branch_create_and_delete(written_two_cmt_server_repo):
    """Exercise branch create / remove / list through the CLI against a clone,
    including the force-delete path for an unmerged branch.
    """
    server, base_repo = written_two_cmt_server_repo
    co = base_repo.checkout(write=True)
    cmt = co.commit_hash
    co.close()
    runner = CliRunner()
    with runner.isolated_filesystem():
        P = getcwd()
        new_repo = Repository(P, exists=False)
        try:
            res = runner.invoke(cli.clone, [
                '--name', 'Foo Tester', '--email', '*****@*****.**', f'{server}'
            ], obj=new_repo)
            assert res.exit_code == 0

            # create from current HEAD, verify listing, then delete cleanly
            res = runner.invoke(cli.branch_create, ['testbranch'], obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == f"Created BRANCH: testbranch HEAD: {cmt}\n"
            branches = new_repo.list_branches()
            assert branches == ['master', 'origin/master', 'testbranch']
            res = runner.invoke(cli.branch_remove, ['testbranch'], obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == f"Deleted BRANCH: testbranch HEAD: {cmt}\n"
            branches = new_repo.list_branches()
            assert branches == ['master', 'origin/master']

            # create a branch with an unmerged commit on it
            new_repo.create_branch('secondtest')
            co = new_repo.checkout(write=True, branch='secondtest')
            co.add_str_column('test_meta')
            newDigest = co.commit('dummy commit')
            co.close()
            # re-open with staging set to master so we can try to delete secondtest
            co = new_repo.checkout(write=True, branch='master')
            co.close()

            # deleting an unmerged branch must fail without --force
            res = runner.invoke(cli.branch_remove, ['secondtest'], obj=new_repo)
            assert res.exit_code == 1
            res = runner.invoke(cli.branch_remove, ['secondtest', '-f'], obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == f"Deleted BRANCH: secondtest HEAD: {newDigest}\n"

            res = runner.invoke(cli.branch_list, obj=new_repo)
            assert res.exit_code == 0
            assert res.stdout == "['master', 'origin/master']\n"
        finally:
            new_repo._env._close_environments()
def test_list_all_remotes(managed_tmpdir):
    """Remotes added through the CLI must appear in both the python API list
    and the ``list-remotes`` command output.
    """
    from hangar.remotes import RemoteInfo
    runner = CliRunner()
    with runner.isolated_filesystem():
        P = getcwd()
        repo = Repository(P, exists=False)
        res = runner.invoke(cli.init, ['--name', 'test', '--email', '*****@*****.**'], obj=repo)
        assert res.exit_code == 0

        # each add echoes the namedtuple repr of the remote it registered
        res = runner.invoke(cli.add_remote, ['origin', 'localhost:50051'], obj=repo)
        assert res.exit_code == 0
        assert res.stdout == "RemoteInfo(name='origin', address='localhost:50051')\n"
        res = runner.invoke(cli.add_remote, ['upstream', 'foo:ip'], obj=repo)
        assert res.exit_code == 0
        assert res.stdout == "RemoteInfo(name='upstream', address='foo:ip')\n"

        # python API view must agree with what was added via the CLI
        remote_list = repo.remote.list_all()
        assert remote_list == [
            RemoteInfo(name='origin', address='localhost:50051'),
            RemoteInfo(name='upstream', address='foo:ip')
        ]

        res = runner.invoke(cli.list_remotes, obj=repo)
        assert res.exit_code == 0
        expected_stdout = "[RemoteInfo(name='origin', address='localhost:50051'), "\
                          "RemoteInfo(name='upstream', address='foo:ip')]\n"
        assert res.stdout == expected_stdout