def test_publish_recursive(origin, src_path, dst_path, sub1_pub, sub2_pub):
    # Recursive publishing of a dataset with the submodules 'subm 1' and
    # 'subm 2': it has to fail while the submodules lack a "target" remote
    # and has to push all three datasets once each of them has one.
    # Afterwards publishing without `to` is exercised with and without
    # `since`.

    # prepare src
    source = install(src_path, source=origin, recursive=True)[0]

    # the top-level dataset publishes into a plain git repository
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # no submodule knows a "target" remote yet -> recursive publish fails
    with assert_raises(ValueError) as cm:
        publish(dataset=source, to="target", recursive=True)
    failure_text = str(cm.exception)
    assert_in("No sibling 'target' found.", failure_text)

    # give each submodule a target of its own (git for sub1, annex for sub2)
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # publish recursively; the log is inspected while it is still captured
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = publish(dataset=source, to="target", recursive=True)
        assert_not_in(
            'forced update',
            cml.out,
            "we probably haven't merged git-annex before pushing")

    # the result is a tuple: (published items, failed/skipped items)
    # (Note: Dataset lacks __eq__ for now, hence the comparison by path.)
    assert_is_instance(res, tuple)
    published = res[0]
    assert_is_instance(published, list)
    not_published = res[1]
    assert_is_instance(not_published, list)
    eq_(not_published, [])  # nothing failed/was skipped
    for entry in published:
        assert_is_instance(entry, Dataset)
    eq_({published[idx].path for idx in range(3)},
        {src_path, sub1.path, sub2.path})

    # every target must carry the same history as its local counterpart
    repo_pairs = (
        (target, source.repo),
        (sub1_target, sub1),
        (sub2_target, sub2),
    )
    for remote_side, local_side in repo_pairs:
        for branch in ("master", "git-annex"):
            eq_(list(remote_side.get_branch_commits(branch)),
                list(local_side.get_branch_commits(branch)))

    # publishing with `since`: without it only the dataset itself is
    # reported, as nothing has changed
    res_ = publish(dataset=source, recursive=True)
    eq_({ds.path for ds in res_[0]}, {src_path})
    # with since='HEAD^' all three are reported
    res_ = publish(dataset=source, recursive=True, since='HEAD^')
    eq_({ds.path for ds in res_[0]}, {src_path, sub1.path, sub2.path})

    # now modify one submodule: add and commit an empty file in sub2
    with open(opj(sub2.path, "file.txt"), 'w') as f:
        f.write('')
    sub2.add('file.txt')
    sub2.commit("")
    # TODO: source.save("changed sub2", auto_add_changes=True) does not work
    # here, see https://github.com/datalad/datalad/issues/636 -- commit via
    # the repository instead
    source.repo.commit("", options=['-a'])
    res_ = publish(dataset=source, recursive=True)
    # only the updated datasets were published
    eq_({ds.path for ds in res_[0]}, {src_path, sub2.path})
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub,
                           dst_clone_path):
    # Publish annexed file content together with the dataset, verify that a
    # fresh clone of the target can obtain that content, and check which
    # items repeated publish calls report.
    import glob

    # prepare src
    source = install(src_path, source=origin, recursive=True)
    source.repo.get('test-annex.dat')

    # annex repository as the target of the top-level dataset
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # plain git targets for both submodules
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = GitRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # TMP workaround kept from the original author: fetch before publishing
    # so that GitPython can resolve the old remote commit SHA git reports
    # after the push.
    # TODO: figure out when to fetch in general, or whether a push option
    # avoids the GitPython failure.
    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=['test-annex.dat'])
    # the data is reported first, then the pushed dataset
    expected_first = (['test-annex.dat', source], [])
    eq_(res, expected_first)

    # XXX master was not checked out in dst!
    target_master = list(target.get_branch_commits("master"))
    source_master = list(source.repo.get_branch_commits("master"))
    eq_(target_master, source_master)
    # TODO: the most recent git-annex commit differs between both sides and
    # is therefore left out of the comparison. The original author suspected
    # that each side records the new availability in a commit of its own
    # (the remote possibly via a hook) -- figure out when to expect this.
    target_annex = list(target.get_branch_commits("git-annex"))[1:]
    source_annex = list(source.repo.get_branch_commits("git-annex"))[1:]
    eq_(target_annex, source_annex)

    # the content check needs master checked out in the target
    target.checkout("master")
    ok_(target.file_has_content('test-annex.dat'))

    # make sure that whatever we published is actually consumable
    dst_clone = install(
        dst_clone_path,
        source=dst_path,
        result_xfm='datasets',
        return_type='item-or-list')
    nok_(dst_clone.repo.file_has_content('test-annex.dat'))
    dst_clone.get('test-annex.dat')
    ok_(dst_clone.repo.file_has_content('test-annex.dat'))

    # there is nothing left to publish on a second attempt
    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=['.'])
    eq_(res, ([], []))

    source.repo.fetch("target")
    # Note: the expansion of '*' contains the submodules themselves in this
    # setup, so this leads to recursive publishing.
    # NOTE(review): glob.glob1 is an undocumented helper of the glob module
    # -- confirm it is available on all supported Python versions.
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))
    # collect the reported paths; datasets are represented by their path
    reported = {
        item.path if isinstance(item, Dataset) else item
        for item in res[0]
    }
    # only the subdatasets are reported; their targets are plain git repos,
    # and the content of the superdataset was already pushed before
    eq_({sub1.path, sub2.path}, reported)

    # if we publish again -- nothing to be published
    eq_(source.publish(to="target"), ([], []))

    # dropping a file is expected to update the git-annex branch, so the
    # dataset gets published once more ...
    source.drop('test-annex.dat')
    eq_(source.publish(to="target"), ([source], []))
    # ... and the result is empty again on the next attempt
    eq_(source.publish(to="target"), ([], []))
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub):
    # Publish annexed file content together with the dataset and check the
    # (published, failed/skipped) result tuples of three publish calls.
    import glob

    # prepare src
    source = install(src_path, source=origin, recursive=True)[0]
    source.repo.get('test-annex.dat')

    # annex repository as the target of the top-level dataset
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # plain git targets for both submodules
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = GitRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # TMP workaround kept from the original author: fetch before publishing
    # so that GitPython can resolve the old remote commit SHA git reports
    # after the push.
    # TODO: figure out when to fetch in general, or whether a push option
    # avoids the GitPython failure.
    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=['test-annex.dat'])
    eq_(res, ([source, 'test-annex.dat'], []))

    target_master = list(target.get_branch_commits("master"))
    source_master = list(source.repo.get_branch_commits("master"))
    eq_(target_master, source_master)
    # TODO: the last commit in the git-annex branch differs, so it is left
    # out of the comparison. Probably fine, but figure out when exactly to
    # expect this for proper testing.
    target_annex = list(target.get_branch_commits("git-annex"))[1:]
    source_annex = list(source.repo.get_branch_commits("git-annex"))[1:]
    eq_(target_annex, source_annex)

    # the content check needs master checked out in the target
    target.checkout("master")
    eq_(target.file_has_content(['test-annex.dat']), [True])

    # publishing '.' reports the dataset and the file once more
    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=['.'])
    eq_(res, ([source, 'test-annex.dat'], []))

    source.repo.fetch("target")
    # Note: the expansion of '*' contains the submodules themselves in this
    # setup, so this leads to recursive publishing.
    # NOTE(review): glob.glob1 is an undocumented helper of the glob module
    # -- confirm it is available on all supported Python versions.
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))

    # collect the reported paths; datasets are represented by their path
    result_paths = [
        item.path if isinstance(item, Dataset) else item
        for item in res[0]
    ]
    expected_paths = {
        source.path,
        opj(source.path, "subm 1"),
        opj(source.path, "subm 2"),
        'test-annex.dat',
    }
    eq_(expected_paths, set(result_paths))
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub,
                           dst_clone_path):
    # Publish annexed file content together with the dataset, verify that a
    # fresh clone of the target can obtain that content, and check the
    # status of the result records of repeated publish calls.
    import glob

    # prepare src
    source = install(src_path, source=origin, recursive=True)
    source.repo.get('test-annex.dat')

    # annex repository as the target of the top-level dataset
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # targets for the submodules: they need to be annexes, because we want
    # to be able to copy data to them further down
    sub1_target = AnnexRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, '2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    res = publish(
        dataset=source,
        to="target",
        path=['test-annex.dat'],
        result_xfm='paths')
    # data is published first and the push follows, but
    # TODO: the order is not fixed (yet), hence the comparison as sets
    eq_(set(res), {opj(source.path, 'test-annex.dat'), source.path})

    # XXX master was not checked out in dst!
    target_master = list(target.get_branch_commits("master"))
    source_master = list(source.repo.get_branch_commits("master"))
    eq_(target_master, source_master)
    # TODO: the most recent git-annex commit differs between both sides and
    # is therefore left out of the comparison. The original author suspected
    # that each side records the new availability in a commit of its own
    # (the remote possibly via a hook) -- figure out when to expect this.
    target_annex = list(target.get_branch_commits("git-annex"))[1:]
    source_annex = list(source.repo.get_branch_commits("git-annex"))[1:]
    eq_(target_annex, source_annex)

    # the content check needs master checked out in the target
    target.checkout("master")
    ok_(target.file_has_content('test-annex.dat'))

    # make sure that whatever we published is actually consumable
    dst_clone = install(
        dst_clone_path,
        source=dst_path,
        result_xfm='datasets',
        return_type='item-or-list')
    nok_(dst_clone.repo.file_has_content('test-annex.dat'))
    dst_clone.get('test-annex.dat')
    ok_(dst_clone.repo.file_has_content('test-annex.dat'))

    # there is nothing to publish on the 2nd attempt
    res = publish(dataset=source, to="target", path=['.'])
    assert_result_count(res, 1, status='notneeded')

    # Note: the expansion of '*' contains the submodules themselves in this
    # setup, so this leads to recursive publishing.
    # NOTE(review): glob.glob1 is an undocumented helper of the glob module
    # -- confirm it is available on all supported Python versions.
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))
    # three records: both subdatasets get published, while the superdataset
    # was already pushed before
    assert_result_count(res, 3)
    for sub in (sub1, sub2):
        assert_result_count(res, 1, status='ok', path=sub.path)
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # if we publish again -- nothing to be published
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # dropping a file is expected to update the git-annex branch, so the
    # dataset gets published once more ...
    source.drop('test-annex.dat')
    res = source.publish(to="target")
    assert_result_count(res, 1, status='ok', path=source.path)
    # ... and nothing is needed again on the next attempt
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub,
                           dst_clone_path):
    # Scenario: a dataset holding the content of 'test-annex.dat' is
    # published to annex targets; the published state must be consumable
    # from a fresh clone, and follow-up publish calls must report
    # 'notneeded' unless something changed in between.
    import glob

    annexed_file = 'test-annex.dat'

    # prepare src
    source = install(src_path, source=origin, recursive=True)
    source.repo.get(annexed_file)

    # annex repository as the target of the top-level dataset
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # targets for the submodules: they need to be annexes, because we want
    # to be able to copy data to them further down
    sub1_target = AnnexRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, '2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    res = publish(dataset=source, to="target", path=[annexed_file],
                  result_xfm='paths')
    # data is published first and the push follows, but
    # TODO: the order is not fixed (yet), hence the comparison as sets
    eq_(set(res), {opj(source.path, annexed_file), source.path})

    # XXX master was not checked out in dst!
    eq_(
        list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")),
    )
    # TODO: the most recent git-annex commit differs between both sides and
    # is therefore skipped. The original author suspected that each side
    # records the new availability in a commit of its own (the remote
    # possibly via a hook) -- figure out when exactly to expect this.
    eq_(
        list(target.get_branch_commits("git-annex"))[1:],
        list(source.repo.get_branch_commits("git-annex"))[1:],
    )

    # the content check needs master checked out in the target
    target.checkout("master")
    ok_(target.file_has_content(annexed_file))

    # make sure that whatever we published is actually consumable
    clone_kwargs = dict(
        source=dst_path,
        result_xfm='datasets',
        return_type='item-or-list',
    )
    dst_clone = install(dst_clone_path, **clone_kwargs)
    nok_(dst_clone.repo.file_has_content(annexed_file))
    dst_clone.get(annexed_file)
    ok_(dst_clone.repo.file_has_content(annexed_file))

    # there is nothing to publish on the 2nd attempt
    res = publish(dataset=source, to="target", path=['.'])
    assert_result_count(res, 1, status='notneeded')

    # Note: the expansion of '*' contains the submodules themselves in this
    # setup, so this leads to recursive publishing.
    # NOTE(review): glob.glob1 is an undocumented helper of the glob module
    # -- confirm it is available on all supported Python versions.
    top_level_entries = glob.glob1(source.path, '*')
    res = publish(dataset=source, to="target", path=top_level_entries)
    # three records: both subdatasets get published, while the superdataset
    # was already pushed before
    assert_result_count(res, 3)
    assert_result_count(res, 1, status='ok', path=sub1.path)
    assert_result_count(res, 1, status='ok', path=sub2.path)
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # if we publish again -- nothing to be published
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # dropping a file is expected to update the git-annex branch, so the
    # dataset gets published once more ...
    source.drop(annexed_file)
    res = source.publish(to="target")
    assert_result_count(res, 1, status='ok', path=source.path)
    # ... and nothing is needed again on the next attempt
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub,
                           dst_clone_path):
    # Publish annexed file content together with the dataset, verify that a
    # fresh clone of the target can obtain that content, check the status
    # of the result records of repeated publish calls, and finally compare
    # the fsck reports of source and target.
    import glob

    # prepare src
    source = install(src_path, source=origin, recursive=True)
    source.repo.get('test-annex.dat')

    # annex repository as the target of the top-level dataset
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # targets for the submodules: they need to be annexes, because we want
    # to be able to copy data to them further down
    sub1_target = AnnexRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, '2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    res = publish(
        dataset=source,
        to="target",
        path=['test-annex.dat'],
        result_xfm='paths')
    # data is published first and the push follows, but
    # TODO: the order is not fixed (yet), hence the comparison as sets
    eq_(set(res), {opj(source.path, 'test-annex.dat'), source.path})

    # XXX the default branch was not checked out in dst!
    target_commits = list(target.get_branch_commits_(DEFAULT_BRANCH))
    source_commits = list(source.repo.get_branch_commits_(DEFAULT_BRANCH))
    eq_(target_commits, source_commits)
    assert_git_annex_branch_published(source.repo, target)

    # the content check needs the default branch checked out in the target
    target.checkout(DEFAULT_BRANCH)
    ok_(target.file_has_content('test-annex.dat'))

    # make sure that whatever we published is actually consumable
    dst_clone = install(
        dst_clone_path,
        source=dst_path,
        result_xfm='datasets',
        return_type='item-or-list')
    nok_(dst_clone.repo.file_has_content('test-annex.dat'))
    dst_clone.get('test-annex.dat')
    ok_(dst_clone.repo.file_has_content('test-annex.dat'))

    # there is nothing to publish on the 2nd attempt
    res = publish(dataset=source, to="target", path=['.'])
    assert_result_count(res, 1, status='notneeded')

    # Note: the expansion of '*' contains the submodules themselves in this
    # setup, so this leads to recursive publishing.
    # NOTE(review): glob.glob1 is an undocumented helper of the glob module
    # -- confirm it is available on all supported Python versions.
    res = publish(dataset=source, to="target",
                  path=glob.glob1(source.path, '*'))
    # three records: both subdatasets get published, while the superdataset
    # was already pushed before
    assert_result_count(res, 3)
    for sub in (sub1, sub2):
        assert_result_count(res, 1, status='ok', path=sub.path)
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # if we publish again -- nothing to be published
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # dropping a file is expected to update the git-annex branch, so the
    # dataset gets published once more ...
    source.drop('test-annex.dat')
    res = source.publish(to="target")
    assert_result_count(res, 1, status='ok', path=source.path)
    # ... and nothing is needed again on the next attempt
    res = source.publish(to="target")
    assert_result_count(res, 1, status='notneeded', path=source.path)

    # data integrity check looks identical from all perspectives
    # minus "note" statements from git-annex
    local_report = filter_fsck_error_msg(source.repo.fsck())
    remote_report = filter_fsck_error_msg(source.repo.fsck(remote='target'))
    eq_(local_report, remote_report)
    target_report = filter_fsck_error_msg(target.fsck())
    remote_report = filter_fsck_error_msg(source.repo.fsck(remote='target'))
    eq_(target_report, remote_report)
def test_publish_recursive(origin, src_path, dst_path, sub1_pub, sub2_pub):
    # Scenario: a dataset with the submodules 'subm 1' and 'subm 2' is
    # published recursively. That must fail as long as the submodules have
    # no "target" remote and must push all three datasets afterwards.
    # Finally, publishing without `to` is exercised with and without
    # `since`.

    # prepare src
    source = install(src_path, source=origin, recursive=True)[0]

    # the top-level dataset publishes into a plain git repository
    target = GitRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # no submodule knows a "target" remote yet -> recursive publish fails
    with assert_raises(ValueError) as cm:
        publish(dataset=source, to="target", recursive=True)
    failure_text = exc_str(cm.exception)
    assert_in("No sibling 'target' found", failure_text)

    # give each submodule a target of its own (git for sub1, annex for sub2)
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = AnnexRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # publish recursively; the log is inspected while it is still captured
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = publish(dataset=source, to="target", recursive=True)
        assert_not_in(
            'forced update',
            cml.out,
            "we probably haven't merged git-annex before pushing")

    # the result is a tuple: (published items, failed/skipped items)
    # (Note: Dataset lacks __eq__ for now, hence the comparison by path.)
    assert_is_instance(res, tuple)
    published = res[0]
    assert_is_instance(published, list)
    not_published = res[1]
    assert_is_instance(not_published, list)
    eq_(not_published, [])  # nothing failed/was skipped
    for entry in published:
        assert_is_instance(entry, Dataset)
    eq_({published[idx].path for idx in range(3)},
        {src_path, sub1.path, sub2.path})

    # every target must carry the same history as its local counterpart
    repo_pairs = (
        (target, source.repo),
        (sub1_target, sub1),
        (sub2_target, sub2),
    )
    for remote_side, local_side in repo_pairs:
        for branch in ("master", "git-annex"):
            eq_(list(remote_side.get_branch_commits(branch)),
                list(local_side.get_branch_commits(branch)))

    # publishing with `since`: without it only the dataset itself is
    # reported, as nothing has changed
    res_ = publish(dataset=source, recursive=True)
    eq_({ds.path for ds in res_[0]}, {src_path})
    # with since='HEAD^' all three are reported
    res_ = publish(dataset=source, recursive=True, since='HEAD^')
    eq_({ds.path for ds in res_[0]}, {src_path, sub1.path, sub2.path})

    # now modify one submodule: add and commit an empty file in sub2
    with open(opj(sub2.path, "file.txt"), 'w') as f:
        f.write('')
    sub2.add('file.txt')
    sub2.commit("")
    # TODO: source.save("changed sub2", all_changes=True) does not work
    # here, see https://github.com/datalad/datalad/issues/636 -- commit via
    # the repository instead
    source.repo.commit("", options=['-a'])
    res_ = publish(dataset=source, recursive=True)
    # only the updated datasets were published
    eq_({ds.path for ds in res_[0]}, {src_path, sub2.path})
def test_publish_with_data(origin, src_path, dst_path, sub1_pub, sub2_pub):
    # Scenario: a dataset holding the content of 'test-annex.dat' is
    # published to an annex target (the submodules get plain git targets);
    # the (published, failed/skipped) tuples of three publish calls are
    # checked.
    import glob

    annexed_file = 'test-annex.dat'

    # prepare src
    source = install(src_path, source=origin, recursive=True)[0]
    source.repo.get(annexed_file)

    # annex repository as the target of the top-level dataset
    target = AnnexRepo(dst_path, create=True)
    target.checkout("TMP", ["-b"])
    source.repo.add_remote("target", dst_path)

    # plain git targets for both submodules
    sub1_target = GitRepo(sub1_pub, create=True)
    sub1_target.checkout("TMP", ["-b"])
    sub2_target = GitRepo(sub2_pub, create=True)
    sub2_target.checkout("TMP", ["-b"])
    sub1 = GitRepo(opj(src_path, 'subm 1'), create=False)
    sub2 = GitRepo(opj(src_path, 'subm 2'), create=False)
    sub1.add_remote("target", sub1_pub)
    sub2.add_remote("target", sub2_pub)

    # TMP workaround kept from the original author: fetch before publishing
    # so that GitPython can resolve the old remote commit SHA git reports
    # after the push.
    # TODO: figure out when to fetch in general, or whether a push option
    # avoids the GitPython failure.
    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=[annexed_file])
    eq_(res, ([source, annexed_file], []))

    eq_(
        list(target.get_branch_commits("master")),
        list(source.repo.get_branch_commits("master")),
    )
    # TODO: the last commit in the git-annex branch differs, so it is
    # skipped. Probably fine, but figure out when exactly to expect this
    # for proper testing.
    eq_(
        list(target.get_branch_commits("git-annex"))[1:],
        list(source.repo.get_branch_commits("git-annex"))[1:],
    )

    # the content check needs master checked out in the target
    target.checkout("master")
    eq_(target.file_has_content([annexed_file]), [True])

    # publishing '.' reports the dataset and the file once more
    source.repo.fetch("target")
    res = publish(dataset=source, to="target", path=['.'])
    eq_(res, ([source, annexed_file], []))

    source.repo.fetch("target")
    # Note: the expansion of '*' contains the submodules themselves in this
    # setup, so this leads to recursive publishing.
    # NOTE(review): glob.glob1 is an undocumented helper of the glob module
    # -- confirm it is available on all supported Python versions.
    top_level_entries = glob.glob1(source.path, '*')
    res = publish(dataset=source, to="target", path=top_level_entries)

    # collect the reported paths; datasets are represented by their path
    expected_paths = {
        source.path,
        opj(source.path, "subm 1"),
        opj(source.path, "subm 2"),
        annexed_file,
    }
    result_paths = [
        item.path if isinstance(item, Dataset) else item
        for item in res[0]
    ]
    eq_(expected_paths, set(result_paths))