Esempio n. 1
0
def test_push_and_clone_master_linear_history_multiple_commits(
        server_instance, repo, managed_tmpdir, array5by7, nCommits, nSamples):
    """Push a linear ``master`` history and verify a fresh clone of it.

    Writes ``nCommits`` commits of ``nSamples`` random float32 samples to the
    ``writtenaset`` column, pushes ``master`` to ``server_instance`` and
    clones it into a new directory. The clone must report the same
    ``master`` history, list every sample key as a remote reference, and
    raise ``FileNotFoundError`` when a sample is read (this test never
    fetches the sample data itself).
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    cmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='writtenaset', shape=(5, 7), dtype=np.float32)
    for cIdx in range(nCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        sampList = []
        with co.columns['writtenaset'] as d:
            # Remove every key except the first one listed before rewriting.
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                sampList.append(arr)
        cmt = co.commit(f'commit number: {cIdx}')
        cmtList.append((cmt, sampList))
        co.close()
    masterHist = list_history(repo._env.refenv,
                              repo._env.branchenv,
                              branch_name='master')

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'

    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance,
                  remove_old=True)
    # try/finally so the clone's environments are released even when an
    # assertion below fails.
    try:
        assert newRepo.list_branches() == ['master', 'origin/master']
        for cmt, sampList in cmtList:
            with pytest.warns(UserWarning):
                nco = newRepo.checkout(commit=cmt)
            try:
                assert len(nco.columns) == 1
                assert 'writtenaset' in nco.columns
                assert len(nco.columns['writtenaset']) == len(sampList)

                assert nco.columns['writtenaset'].contains_remote_references is True
                remoteKeys = nco.columns['writtenaset'].remote_reference_keys
                expectedKeys = tuple(str(idx) for idx in range(len(sampList)))
                assert expectedKeys == remoteKeys
                for sIdx in expectedKeys:
                    assert sIdx in nco.columns['writtenaset']
                    # The key is known, but reading its data must fail.
                    with pytest.raises(FileNotFoundError):
                        nco.columns['writtenaset'][sIdx]
            finally:
                nco.close()
        cloneMasterHist = list_history(newRepo._env.refenv,
                                       newRepo._env.branchenv,
                                       branch_name='master')
        assert cloneMasterHist == masterHist
    finally:
        newRepo._env._close_environments()
Esempio n. 2
0
    def test_server_fetch_data_sample_not_valid_type(
            self, two_multi_format_repo_class, managed_tmpdir_class, tmp_path_factory
    ):
        """Check that malformed ``samples`` specifiers are rejected.

        After cloning from the server, ``fetch_data_sample`` must raise
        ``TypeError`` for a ``bytes`` sample key on ``array_flat`` and
        ``ValueError`` (matching 'nested column specifier sequence') for a
        three-element tuple on ``array_nested``.
        """
        from hangar import Repository

        cmt, server_instance = two_multi_format_repo_class

        # Clone test (master branch)
        new_tmpdir = str(tmp_path_factory.mktemp('newclone', numbered=True))
        newRepo = Repository(path=new_tmpdir, exists=False)
        newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

        # try/finally so the environments are released even if an expected
        # exception is not raised.
        try:
            with pytest.raises(TypeError):
                newRepo.remote.fetch_data_sample(
                    remote='origin',
                    branch='master',
                    column='array_flat',
                    samples=[b'BYTES_TYPE_NOT_VALID'])

            with pytest.raises(ValueError, match='nested column specifier sequence'):
                newRepo.remote.fetch_data_sample(
                    remote='origin',
                    branch='master',
                    column='array_nested',
                    samples=[(0, 1, 'ARRAY_NOT_VALID')])
        finally:
            newRepo._env._close_environments()
Esempio n. 3
0
def test_push_clone_three_way_merge(server_instance, repo_2_br_no_conf, managed_tmpdir):
    """Push two branches, merge them, push the merge, and verify a clone.

    The merge commit on ``master`` must differ from both pre-merge branch
    heads, and a fresh clone must report the same ``master`` head and commit
    order as the source repository after the merge was pushed.
    """
    from hangar import Repository

    repo_2_br_no_conf.remote.add('origin', server_instance)
    push1 = repo_2_br_no_conf.remote.push('origin', 'master')
    assert push1 == 'master'
    push2 = repo_2_br_no_conf.remote.push('origin', 'testbranch')
    assert push2 == 'testbranch'

    # Heads of both branches before the merge is made.
    test_head = repo_2_br_no_conf.log(branch='testbranch', return_contents=True)['head']
    master_head = repo_2_br_no_conf.log(branch='master', return_contents=True)['head']

    merge_cmt = repo_2_br_no_conf.merge('merge commit', 'master', 'testbranch')
    # One log query supplies both the head and the order.
    merge_log = repo_2_br_no_conf.log(branch='master', return_contents=True)
    merge_head = merge_log['head']
    merge_order = merge_log['order']
    merge_push = repo_2_br_no_conf.remote.push('origin', 'master')
    assert merge_push == 'master'
    assert merge_head != master_head
    assert merge_head != test_head

    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

    # try/finally so the clone's environments are released on failure too.
    try:
        clone_log = newRepo.log(branch='master', return_contents=True)
        clone_head = clone_log['head']
        clone_order = clone_log['order']
        assert clone_head == merge_head == merge_cmt
        assert merge_order == clone_order
    finally:
        newRepo._env._close_environments()
Esempio n. 4
0
    def test_server_fetch_data_sample_not_existing_fails(
            self, two_multi_format_repo_class, managed_tmpdir_class, tmp_path_factory
    ):
        """Check that fetching unknown sample keys raises ``KeyError``.

        Covers a missing key in the flat column, a missing subsample key in
        the nested column, and a missing sample key in the nested column.
        """
        from hangar import Repository

        cmt, server_instance = two_multi_format_repo_class

        # Clone test (master branch)
        new_tmpdir = str(tmp_path_factory.mktemp('newclone', numbered=True))
        newRepo = Repository(path=new_tmpdir, exists=False)
        newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

        # (column, samples) pairs which must each be rejected with KeyError.
        missing_specs = (
            ('array_flat', ['DOESNOTEXIST']),
            ('array_nested', [(1, 'DOESNOTEXIST')]),
            ('array_nested', [('DOESNOTEXIST', 0)]),
        )

        # try/finally so the environments are released even if an expected
        # exception is not raised.
        try:
            for column, samples in missing_specs:
                with pytest.raises(KeyError):
                    newRepo.remote.fetch_data_sample(
                        remote='origin',
                        branch='master',
                        column=column,
                        samples=samples)
        finally:
            newRepo._env._close_environments()
Esempio n. 5
0
def test_push_clone_digests_exceeding_server_nbyte_limit(
        mocker, server_instance_nbytes_limit, repo, managed_tmpdir):
    """Push and fetch data through ``server_instance_nbytes_limit``.

    Four commits of 70 ``(50, 50)`` float32 samples are pushed while spying
    on ``chunks.tensorChunkedIterator``; the test pins the number of chunked
    sends and an upper bound on each call's ``uncomp_nbytes``. A clone then
    fetches the data for every commit, the samples are compared against what
    was written, and the number of ``HangarClient.fetch_data`` calls is
    pinned.
    """
    from hangar import Repository
    from hangar.remote import chunks, client

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='aset', shape=(50, 50), dtype=np.float32)
    for cIdx in range(4):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.columns['aset'] as d:
            # Remove every key except the first one listed before rewriting.
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(70):
                arr = np.random.randint(0, 255, size=(50, 50)).astype(np.float32)
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance_nbytes_limit)

    spy = mocker.spy(chunks, 'tensorChunkedIterator')
    push1 = repo.remote.push('origin', 'master')
    assert chunks.tensorChunkedIterator.call_count == 6
    for call in spy.call_args_list:
        # maximum amount over 100_000 observed in test development
        assert call[1]['uncomp_nbytes'] <= 550_000

    assert push1 == 'master'

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance_nbytes_limit,
                  remove_old=True)
    # try/finally so the clone's environments are released on failure too.
    try:
        assert newRepo.list_branches() == ['master', 'origin/master']

        mocker.spy(client.HangarClient, 'fetch_data')
        for cmt, sampList in masterCmtList:
            newRepo.remote.fetch_data('origin', commit=cmt)
            nco = newRepo.checkout(commit=cmt)
            try:
                assert len(nco.columns) == 1
                assert 'aset' in nco.columns
                assert len(nco.columns['aset']) == 70
                for sIdx, samp in enumerate(sampList):
                    assert np.allclose(nco.columns['aset'][str(sIdx)], samp)
            finally:
                nco.close()
            del nco
        assert client.HangarClient.fetch_data.call_count == 8
    finally:
        newRepo._env._close_environments()
Esempio n. 6
0
def clone(repo: Repository, remote, name, email, overwrite):
    """Initialize a repository at the current path and fetch updated records from REMOTE.

    Note: This method does not actually download the data to disk. Please look
    into the ``fetch-data`` command.
    """
    # An already initialized repository is only replaced when ``overwrite``
    # is set; otherwise report its location and leave it untouched.
    keep_existing = repo.initialized and not overwrite
    if not keep_existing:
        repo.clone(name, email, remote, remove_old=overwrite)
        return
    click.echo(f'Repo already exists at: {repo.path}')
Esempio n. 7
0
def clone(remote, uname, email, overwrite):
    # Clone REMOTE into the current working directory and report the path.
    # Documented with comments only, so no docstring text is introduced.
    cwd = os.getcwd()
    # A user name given as several words (list/tuple) becomes one string.
    user = ' '.join(uname) if isinstance(uname, (list, tuple)) else uname
    target = Repository(path=cwd)
    target.clone(
        user_name=user,
        user_email=email,
        remote_address=remote,
        remove_old=overwrite,
    )
    click.echo(f'Hangar repository initialized at {cwd}')
Esempio n. 8
0
 def test_local_without_data_fails_no_common_no_local(self, written_two_cmt_server_repo, managed_tmpdir):
     """``make_torch_dataset`` must raise ``ValueError`` on a fresh clone.

     The repository is cloned from the server and no data is fetched by this
     test before the dataset is requested for ``writtenaset``.
     """
     new_tmpdir = pjoin(managed_tmpdir, 'new')
     mkdir(new_tmpdir)
     server, _ = written_two_cmt_server_repo
     repo = Repository(path=new_tmpdir, exists=False)
     repo.clone('name', '[email protected]', server, remove_old=True)
     # Nested try/finally: close the checkout first, then the environments,
     # even when the expected exception is not raised.
     try:
         co = repo.checkout()
         try:
             aset = co.arraysets['writtenaset']
             with pytest.raises(ValueError):
                 make_torch_dataset(aset)
         finally:
             co.close()
     finally:
         repo._env._close_environments()
Esempio n. 9
0
 def test_local_without_data_fails_data_unavailable(self, written_two_cmt_server_repo, managed_tmpdir):
     """``make_tf_dataset`` must raise ``FileNotFoundError`` on a fresh clone.

     The repository is cloned from the server and no data is fetched by this
     test before a dataset over keys ``'1'`` and ``'2'`` of ``writtenaset``
     is requested.
     """
     new_tmpdir = pjoin(managed_tmpdir, 'new')
     mkdir(new_tmpdir)
     server, _ = written_two_cmt_server_repo
     repo = Repository(path=new_tmpdir, exists=False)
     repo.clone('name', '[email protected]', server, remove_old=True)
     # Nested try/finally: close the checkout first, then the environments,
     # even when the expected exception is not raised.
     try:
         co = repo.checkout()
         try:
             aset = co.arraysets['writtenaset']
             with pytest.raises(FileNotFoundError):
                 make_tf_dataset(aset, keys=['1', '2'])
         finally:
             co.close()
     finally:
         repo._env._close_environments()
Esempio n. 10
0
def clone(ctx, remote, name, email, overwrite):
    """Initialize a repository at the current path and fetch updated records from REMOTE.

    Note: This method does not actually download the data to disk. Please look
    into the ``fetch-data`` command.
    """
    # ``ctx`` is accepted but not used by this function.
    cwd = os.getcwd()
    # ``exists=False``: the repository is created by the clone itself.
    target = Repository(path=cwd, exists=False)
    target.clone(
        user_name=name,
        user_email=email,
        remote_address=remote,
        remove_old=overwrite,
    )
    click.echo(f'Hangar repository initialized at {cwd}')
Esempio n. 11
0
def test_push_clone_digests_exceeding_server_nbyte_limit(
        server_instance, repo, managed_tmpdir):
    """Push and fetch data with the gRPC byte limits lowered to 100_000.

    Four commits of 70 ``(50, 20)`` float32 samples are pushed to the
    server. A clone then fetches the data for every commit and the samples
    are compared against what was written.
    """
    from hangar.remote import config
    from hangar import Repository

    # NOTE(review): these module-level overrides are never restored in this
    # function; confirm that no later test depends on the default limits.
    config.config['server']['grpc']['fetch_max_nbytes'] = 100_000
    config.config['client']['grpc']['push_max_nbytes'] = 100_000

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='aset', shape=(50, 20), dtype=np.float32)
    for cIdx in range(4):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.arraysets['aset'] as d:
            # Remove every key except the first one listed before rewriting.
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
            for sIdx in range(70):
                arr = np.random.randn(50, 20).astype(np.float32)
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance,
                  remove_old=True)
    # try/finally so the clone's environments are released on failure too.
    try:
        assert newRepo.list_branches() == ['master', 'origin/master']
        for cmt, sampList in masterCmtList:
            newRepo.remote.fetch_data('origin', commit=cmt)
            nco = newRepo.checkout(commit=cmt)
            try:
                assert len(nco.arraysets) == 1
                assert 'aset' in nco.arraysets
                assert len(nco.arraysets['aset']) == 70
                for sIdx, samp in enumerate(sampList):
                    assert np.allclose(nco.arraysets['aset'][str(sIdx)], samp)
            finally:
                nco.close()
    finally:
        newRepo._env._close_environments()
Esempio n. 12
0
def test_push_restricted_with_right_username_password(
        server_instance_push_restricted, repo, managed_tmpdir):
    """Push with credentials, then clone and verify the pushed data.

    One commit of 70 ``(50, 20)`` float32 samples is pushed with the
    ``username``/``password`` arguments supplied. A clone then fetches the
    commit's data and the samples are compared against what was written.
    """
    from hangar import Repository

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='aset', shape=(50, 20), dtype=np.float32)
    for cIdx in range(1):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.columns['aset'] as d:
            # Remove every key except the first one listed before rewriting.
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(70):
                arr = np.random.randn(50, 20).astype(np.float32)
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance_push_restricted)
    push1 = repo.remote.push('origin',
                             'master',
                             username='******',
                             password='******')
    assert push1 == 'master'

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance_push_restricted,
                  remove_old=True)
    # try/finally so the clone's environments are released on failure too.
    try:
        assert newRepo.list_branches() == ['master', 'origin/master']
        for cmt, sampList in masterCmtList:
            newRepo.remote.fetch_data('origin', commit=cmt)
            nco = newRepo.checkout(commit=cmt)
            try:
                assert len(nco.columns) == 1
                assert 'aset' in nco.columns
                assert len(nco.columns['aset']) == 70
                for sIdx, samp in enumerate(sampList):
                    assert np.allclose(nco.columns['aset'][str(sIdx)], samp)
            finally:
                nco.close()
    finally:
        newRepo._env._close_environments()
Esempio n. 13
0
    def test_server_fetch_data_sample(
            self, two_multi_format_repo_class, managed_tmpdir_class,
            fetchOp, column_name, keys, tmp_path_factory
    ):
        """Fetch selected samples into a fresh clone and read them back.

        ``fetchOp`` chooses whether the fetch is addressed by branch
        (``'master'``) or by commit (``cmt``); any other value raises
        ``ValueError``. The fetch must report ``cmt``, and every requested
        key in ``column_name`` must then be readable from a checkout.
        """
        from hangar import Repository

        cmt, server_instance = two_multi_format_repo_class

        # Clone test (master branch)
        clone_dir = str(tmp_path_factory.mktemp('newclone', numbered=True))
        newRepo = Repository(path=clone_dir, exists=False)
        newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

        # ------------------ format arguments depending on options -----------------

        if fetchOp == 'branch':
            selector = {'branch': 'master'}
        elif fetchOp == 'commit':
            selector = {'commit': cmt}
        else:
            raise ValueError(f'fetchOp unknown: {fetchOp}')
        fetch_kwargs = {'column': column_name, 'samples': keys, **selector}

        fetched = newRepo.remote.fetch_data_sample(remote='origin', **fetch_kwargs)
        assert fetched == cmt

        checkout = newRepo.checkout()
        try:
            column = checkout[column_name]
            if isinstance(keys, (list, tuple)):
                is_flat = column_name.endswith('flat')
                for spec in keys:
                    if is_flat:
                        # flat columns are addressed by the key directly
                        assert column[spec] is not None
                    elif not isinstance(spec, (list, tuple)):
                        # bare sample key on a nested column: read all of it
                        assert column[spec][...] is not None
                    elif len(spec) == 2:
                        outer, inner = spec
                        assert column[outer][inner] is not None
                    elif len(spec) == 1:
                        assert column[spec[0]][...] is not None
        finally:
            checkout.close()
            newRepo._env._close_environments()
Esempio n. 14
0
    def test_server_fetch_data_sample_branch_and_commit_args_passed_fails(
            self, two_multi_format_repo_class, managed_tmpdir_class, tmp_path_factory
    ):
        """Passing both ``branch`` and ``commit`` must raise ``ValueError``.

        Both values are individually plausible (``'master'`` and the commit
        provided by the fixture), so only their combination is under test.
        """
        from hangar import Repository

        cmt, server_instance = two_multi_format_repo_class

        # Clone test (master branch)
        new_tmpdir = str(tmp_path_factory.mktemp('newclone', numbered=True))
        newRepo = Repository(path=new_tmpdir, exists=False)
        newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

        # try/finally so the environments are released even if the expected
        # exception is not raised.
        try:
            with pytest.raises(ValueError, match='``branch`` and ``commit``'):
                newRepo.remote.fetch_data_sample(
                    remote='origin',
                    branch='master',  # actual value which might otherwise work
                    commit=cmt,  # actual value which might otherwise work
                    column='array_flat',
                    samples=[0, 1])
        finally:
            newRepo._env._close_environments()
Esempio n. 15
0
    def test_server_push_two_branch_then_clone_fetch_data_options(
            self, two_branch_multi_commit_repo_class, managed_tmpdir_class, array5by7_class,
            fetchBranch, fetchCommit, fetchAsetns, fetchNbytes, fetchAll_history, tmp_path_factory):
        """Exercise ``remote.fetch_data`` option combinations on a clone.

        The fetch is addressed by ``fetchBranch`` when it is given, otherwise
        by the head commit of the history selected through ``fetchCommit``
        (``'br'`` selects the branch history, anything else ``master``).
        ``fetchAsetns`` may restrict the columns, ``fetchNbytes`` may cap
        the bytes retrieved, and ``fetchAll_history`` asks for every commit
        in the history instead of only the head.

        Combining ``fetchAll_history is True`` with an integer
        ``fetchNbytes`` must raise ``ValueError``; that path returns ``True``
        early. Otherwise every fetched commit is checked out and its samples
        are compared with ``devCmts``; when a byte cap is set, samples may be
        missing (``FileNotFoundError``) but the bytes seen may not exceed it.
        """
        from hangar import Repository
        from operator import eq

        branch, branchHist, devCmts, masterHist, server_instance = two_branch_multi_commit_repo_class

        # Clone test (master branch)
        new_tmpdir = str(tmp_path_factory.mktemp('newclone', numbered=True))
        newRepo = Repository(path=new_tmpdir, exists=False)
        newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

        # One try/finally releases the environments on every exit path.
        try:
            newRepo.remote.fetch('origin', branch=branch.name)
            newRepo.create_branch('testbranch', base_commit=branchHist['head'])
            assert newRepo.list_branches() == [
                'master', 'origin/master', f'origin/{branch.name}', branch.name]

            # ---------------- format arguments depending on options ----------------

            kwargs = {
                'column_names': fetchAsetns,
                'max_num_bytes': fetchNbytes,
                'retrieve_all_history': fetchAll_history,
            }
            if fetchBranch is not None:
                hist = branchHist if fetchBranch == 'testbranch' else masterHist
                kwargs['branch'] = fetchBranch
                kwargs['commit'] = None
            else:
                # fetchBranch is None here, so the history has to be chosen
                # from fetchCommit (the old `fetchBranch == 'br'` check could
                # never be true).
                hist = branchHist if fetchCommit == 'br' else masterHist
                kwargs['branch'] = None
                kwargs['commit'] = hist['head']

            if fetchAll_history is True:
                commits_to_check = hist['order']
            else:
                commits_to_check = [hist['head']]

            # --------------------- retrieve data with desired options ------------

            # This case should fail
            if (fetchAll_history is True) and isinstance(fetchNbytes, int):
                with pytest.raises(ValueError):
                    newRepo.remote.fetch_data(remote='origin', **kwargs)
                return True
            # get data
            fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
            assert commits_to_check == fetch_commits

            # ----------- check that you got everything you expected --------------

            for fCmt in fetch_commits:
                co = newRepo.checkout(commit=fCmt)
                try:
                    assert co.commit_hash == fCmt

                    # when we are checking one aset only
                    if isinstance(fetchAsetns, tuple):
                        d = co.columns[fetchAsetns[0]]

                        ds1SampList, ds2SampList, ds3SampList, ds4SampList = devCmts[fCmt]
                        if fetchAsetns[0] == 'writtenaset':
                            compare = ds1SampList
                            cmp_func = np.allclose
                        elif fetchAsetns[0] == '_two':
                            compare = ds2SampList
                            cmp_func = np.allclose
                        elif fetchAsetns[0] == 'str_col':
                            compare = ds3SampList
                            cmp_func = eq
                        else:
                            compare = ds4SampList
                            cmp_func = eq

                        totalSeen = 0
                        for idx, samp in enumerate(compare):
                            if fetchNbytes is None:
                                assert cmp_func(samp, d[str(idx)])
                            else:
                                try:
                                    arr = d[str(idx)]
                                    assert cmp_func(samp, arr)
                                    # values without ``nbytes`` are sized by length
                                    try:
                                        totalSeen += arr.nbytes
                                    except AttributeError:
                                        totalSeen += len(arr)
                                except FileNotFoundError:
                                    # sample was not retrieved under the byte cap
                                    pass
                                assert totalSeen <= fetchNbytes

                    # compare all asets at the same time
                    else:
                        d = co.columns['writtenaset']
                        dd = co.columns['_two']
                        str_col = co.columns['str_col']
                        bytes_col = co.columns['bytes_col']
                        ds1List, ds2List, ds3List, ds4List = devCmts[fCmt]
                        totalSeen = 0
                        for idx, (ds1, ds2, ds3, ds4) in enumerate(
                                zip(ds1List, ds2List, ds3List, ds4List)):
                            if fetchNbytes is None:
                                assert np.allclose(ds1, d[str(idx)])
                                assert np.allclose(ds2, dd[str(idx)])
                                assert ds3 == str_col[str(idx)]
                                assert ds4 == bytes_col[str(idx)]
                            else:
                                # Each column may or may not hold the sample
                                # locally; count only what is present.
                                try:
                                    arr1 = d[str(idx)]
                                    assert np.allclose(ds1, arr1)
                                    totalSeen += arr1.nbytes
                                except FileNotFoundError:
                                    pass
                                try:
                                    arr2 = dd[str(idx)]
                                    assert np.allclose(ds2, arr2)
                                    totalSeen += arr2.nbytes
                                except FileNotFoundError:
                                    pass
                                try:
                                    sval = str_col[str(idx)]
                                    assert ds3 == sval
                                    totalSeen += len(sval.encode())
                                except FileNotFoundError:
                                    pass
                                try:
                                    bval = bytes_col[str(idx)]
                                    assert ds4 == bval
                                    totalSeen += len(bval)
                                except FileNotFoundError:
                                    pass
                                assert totalSeen <= fetchNbytes
                finally:
                    co.close()
        finally:
            newRepo._env._close_environments()
Esempio n. 16
0
def test_server_push_two_branch_then_clone_fetch_data_options(
        server_instance, repo, managed_tmpdir, array5by7, nMasterCommits,
        nMasterSamples, nDevCommits, nDevSamples, fetchBranch, fetchCommit,
        fetchAsetns, fetchNbytes, fetchAll_history):
    """Exercise ``remote.fetch_data`` option combinations on a clone.

    Setup writes ``nMasterCommits`` commits on ``master`` and ``nDevCommits``
    commits on ``testbranch`` (two arraysets, ``writtenaset`` and ``_two``)
    and pushes both branches. A fresh clone then fetches data addressed by
    ``fetchBranch`` when it is given, otherwise by the head commit of the
    history selected through ``fetchCommit`` (``'br'`` selects the branch
    history, anything else ``master``).

    Combining ``fetchAll_history is True`` with an integer ``fetchNbytes``
    must raise ``ValueError``; that path returns ``True`` early. Otherwise
    every fetched commit is checked out and compared with what was written;
    when a byte cap is set, samples may be missing (``FileNotFoundError``)
    but the bytes seen may not exceed it.
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    # Push master branch test
    masterCmts = {}
    co = repo.checkout(write=True)
    co.arraysets.init_arrayset(name='writtenaset',
                               shape=(5, 7),
                               dtype=np.float32)
    # NOTE: ``(20)`` is the int 20, not a 1-tuple.
    co.arraysets.init_arrayset(name='_two', shape=(20), dtype=np.float32)
    for cIdx in range(nMasterCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList1 = []
        masterSampList2 = []
        with co.arraysets['writtenaset'] as d, co.arraysets['_two'] as dd:
            # Remove every key except the first one listed, in both arraysets.
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
                dd.remove(prevKey)

            for sIdx in range(nMasterSamples):
                arr1 = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr1
                masterSampList1.append(arr1)
                arr2 = np.random.randn(20).astype(np.float32)
                dd[str(sIdx)] = arr2
                masterSampList2.append(arr2)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmts[cmt] = (masterSampList1, masterSampList2)
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'
    masterHist = list_history(repo._env.refenv,
                              repo._env.branchenv,
                              branch_name='master')

    # Push dev branch test
    devCmts = masterCmts.copy()
    branch = repo.create_branch('testbranch')
    for cIdx in range(nDevCommits):
        co = repo.checkout(write=True, branch=branch.name)
        devSampList1 = []
        devSampList2 = []
        with co.arraysets['writtenaset'] as d, co.arraysets['_two'] as dd:
            for prevKey in list(d.keys())[1:]:
                d.remove(prevKey)
                dd.remove(prevKey)

            for sIdx in range(nDevSamples):
                arr1 = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr1
                devSampList1.append(arr1)
                arr2 = np.random.randn(20).astype(np.float32)
                dd[str(sIdx)] = arr2
                devSampList2.append(arr2)
        cmt = co.commit(f'dev commit number: {cIdx}')
        devCmts[cmt] = (devSampList1, devSampList2)
        co.close()

    push2 = repo.remote.push('origin', branch.name)
    assert push2 == branch.name
    branchHist = list_history(repo._env.refenv,
                              repo._env.branchenv,
                              branch_name=branch.name)

    # -------------------------- end setup ------------------------------------

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User',
                  '*****@*****.**',
                  server_instance,
                  remove_old=True)

    # One try/finally releases the environments on every exit path.
    try:
        newRepo.remote.fetch('origin', branch=branch.name)
        newRepo.create_branch('testbranch', base_commit=branchHist['head'])
        assert newRepo.list_branches() == [
            'master', 'origin/master', f'origin/{branch.name}', branch.name
        ]

        # ---------------- format arguments depending on options ----------------

        kwargs = {
            'arrayset_names': fetchAsetns,
            'max_num_bytes': fetchNbytes,
            'retrieve_all_history': fetchAll_history,
        }
        if fetchBranch is not None:
            hist = branchHist if fetchBranch == 'testbranch' else masterHist
            kwargs['branch'] = fetchBranch
            kwargs['commit'] = None
        else:
            # fetchBranch is None here, so the history has to be chosen from
            # fetchCommit (the old `fetchBranch == 'br'` check could never be
            # true).
            hist = branchHist if fetchCommit == 'br' else masterHist
            kwargs['branch'] = None
            kwargs['commit'] = hist['head']

        if fetchAll_history is True:
            commits_to_check = hist['order']
        else:
            commits_to_check = [hist['head']]

        # --------------------- retrieve data with desired options ------------

        # This case should fail
        if (fetchAll_history is True) and isinstance(fetchNbytes, int):
            with pytest.raises(ValueError):
                newRepo.remote.fetch_data(remote='origin', **kwargs)
            return True
        # get data
        fetch_commits = newRepo.remote.fetch_data(remote='origin', **kwargs)
        assert commits_to_check == fetch_commits

        # ----------- check that you got everything you expected --------------

        for fCmt in fetch_commits:
            co = newRepo.checkout(commit=fCmt)
            try:
                assert co.commit_hash == fCmt

                # when we are checking one aset only
                if isinstance(fetchAsetns, tuple):
                    d = co.arraysets[fetchAsetns[0]]

                    ds1SampList, ds2SampList = devCmts[fCmt]
                    if fetchAsetns[0] == 'writtenaset':
                        compare = ds1SampList
                    else:
                        compare = ds2SampList

                    totalSeen = 0
                    for idx, samp in enumerate(compare):
                        if fetchNbytes is None:
                            assert np.allclose(samp, d[str(idx)])
                        else:
                            try:
                                arr = d[str(idx)]
                                assert np.allclose(samp, arr)
                                totalSeen += arr.nbytes
                            except FileNotFoundError:
                                # sample was not retrieved under the byte cap
                                pass
                            assert totalSeen <= fetchNbytes

                # compare both asets at the same time
                else:
                    d = co.arraysets['writtenaset']
                    dd = co.arraysets['_two']
                    ds1List, ds2List = devCmts[fCmt]
                    totalSeen = 0
                    for idx, (ds1, ds2) in enumerate(zip(ds1List, ds2List)):
                        if fetchNbytes is None:
                            assert np.allclose(ds1, d[str(idx)])
                            assert np.allclose(ds2, dd[str(idx)])
                        else:
                            # Either arrayset may or may not hold the sample
                            # locally; count only what is present.
                            try:
                                arr1 = d[str(idx)]
                                assert np.allclose(ds1, arr1)
                                totalSeen += arr1.nbytes
                            except FileNotFoundError:
                                pass
                            try:
                                arr2 = dd[str(idx)]
                                assert np.allclose(ds2, arr2)
                                totalSeen += arr2.nbytes
                            except FileNotFoundError:
                                pass
                            assert totalSeen <= fetchNbytes
            finally:
                co.close()
    finally:
        newRepo._env._close_environments()
Esempio n. 17
0
def test_server_push_second_branch_with_new_commit_then_clone_partial_fetch(
        server_instance, repo, managed_tmpdir, array5by7, nMasterCommits,
        nMasterSamples, nDevCommits, nDevSamples):
    """Push ``master`` and a dev branch, clone, then fetch the dev branch.

    After cloning, every ``master`` commit must be checkable with all sample
    keys present as remote references whose data cannot be read
    (``FileNotFoundError``). After ``remote.fetch`` of the dev branch the
    same must hold for every dev commit, and the histories of ``master`` and
    ``origin/<branch>`` in the clone must equal those of the source
    repository.
    """
    from hangar import Repository
    from hangar.records.summarize import list_history

    # Push master branch test
    masterCmtList = []
    co = repo.checkout(write=True)
    co.add_ndarray_column(name='writtenaset', shape=(5, 7), dtype=np.float32)
    for cIdx in range(nMasterCommits):
        if cIdx != 0:
            co = repo.checkout(write=True)
        masterSampList = []
        with co.columns['writtenaset'] as d:
            # Remove every key except the first one listed before rewriting.
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nMasterSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                masterSampList.append(arr)
        cmt = co.commit(f'master commit number: {cIdx}')
        masterCmtList.append((cmt, masterSampList))
        co.close()

    repo.remote.add('origin', server_instance)
    push1 = repo.remote.push('origin', 'master')
    assert push1 == 'master'
    masterHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name='master')

    # Push dev branch test
    devCmtList = []
    branch = repo.create_branch('testbranch')
    for cIdx in range(nDevCommits):
        co = repo.checkout(write=True, branch=branch.name)
        devSampList = []
        with co.columns['writtenaset'] as d:
            for prevKey in list(d.keys())[1:]:
                del d[prevKey]
            for sIdx in range(nDevSamples):
                arr = np.random.randn(*array5by7.shape).astype(np.float32) * 100
                d[str(sIdx)] = arr
                devSampList.append(arr)
        cmt = co.commit(f'dev commit number: {cIdx}')
        devCmtList.append((cmt, devSampList))
        co.close()

    push2 = repo.remote.push('origin', branch.name)
    assert push2 == branch.name
    branchHist = list_history(repo._env.refenv, repo._env.branchenv, branch_name=branch.name)

    # Clone test (master branch)
    new_tmpdir = pjoin(managed_tmpdir, 'new')
    mkdir(new_tmpdir)
    newRepo = Repository(path=new_tmpdir, exists=False)
    newRepo.clone('Test User', '*****@*****.**', server_instance, remove_old=True)

    def _assert_commit_has_only_remote_refs(cmt, sampList, nExpected):
        # Check out ``cmt`` in the clone and verify that every sample key is
        # listed as a remote reference whose data cannot be read.
        with pytest.warns(UserWarning):
            nco = newRepo.checkout(commit=cmt)
        try:
            assert len(nco.columns) == 1
            assert 'writtenaset' in nco.columns
            assert len(nco.columns['writtenaset']) == nExpected

            assert nco.columns['writtenaset'].contains_remote_references is True
            remoteKeys = nco.columns['writtenaset'].remote_reference_keys
            expectedKeys = tuple(str(idx) for idx in range(len(sampList)))
            assert expectedKeys == remoteKeys
            for sIdx in expectedKeys:
                assert sIdx in nco.columns['writtenaset']
                with pytest.raises(FileNotFoundError):
                    nco.columns['writtenaset'][sIdx]
        finally:
            nco.close()

    # try/finally so the clone's environments are released on failure too.
    try:
        assert newRepo.list_branches() == ['master', 'origin/master']
        for cmt, sampList in masterCmtList:
            _assert_commit_has_only_remote_refs(cmt, sampList, nMasterSamples)
        cloneMasterHist = list_history(
            newRepo._env.refenv, newRepo._env.branchenv, branch_name='master')
        assert cloneMasterHist == masterHist

        # Fetch test
        fetch = newRepo.remote.fetch('origin', branch=branch.name)
        assert fetch == f'origin/{branch.name}'
        assert newRepo.list_branches() == ['master', 'origin/master', f'origin/{branch.name}']
        for cmt, sampList in devCmtList:
            _assert_commit_has_only_remote_refs(cmt, sampList, nDevSamples)

        cloneBranchHist = list_history(
            newRepo._env.refenv, newRepo._env.branchenv, branch_name=f'origin/{branch.name}')
        assert cloneBranchHist == branchHist
    finally:
        newRepo._env._close_environments()