def test_get_values():
    data = {'x': 'y', 'g': 'h', 'a': 'b'}
    keys = ['x', 'a']
    gen = Sink()
    list(gen(data))
    eq_(gen.get_values(keys), [['y', 'b']])

def test_listdict2dictlist():
    f = _listdict2dictlist
    l1 = [1, 3, [1, 'a']]
    assert f(l1) is l1, "we return it as is if no emb dict"
    eq_(f([{1: 2}]), {1: 2})  # inside out, no need for a list
    # inside out, join into the list, skip entry with a list, or space
    eq_(f([{1: [2, 3], 'a': 1}, {'a': 2, 'c': ''}]), {'a': [1, 2]})

def test_add_readme(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    assert_status('ok', ds.add_readme())
    # should use default name
    eq_(
        open(opj(path, 'README.md')).read(),
        """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
            id=ds.id))

    # should skip on re-run
    assert_status('notneeded', ds.add_readme())

def test_get_recurse_dirs(o_path, c_path):
    # prepare source:
    origin = Dataset(o_path).create(force=True)
    origin.add('.')

    ds = install(
        c_path, source=o_path,
        result_xfm='datasets', return_type='item-or-list')

    file_list = ['file1.txt',
                 opj('subdir', 'file2.txt'),
                 opj('subdir', 'subsubdir', 'file3.txt'),
                 opj('subdir', 'subsubdir', 'file4.txt')]
    files_in_sub = [f for f in file_list if f.startswith(with_pathsep('subdir'))]

    # no content present:
    ok_(not any(ds.repo.file_has_content(file_list)))

    result = ds.get('subdir')

    # check result:
    assert_status('ok', result)
    eq_(set([item.get('path')[len(ds.path) + 1:] for item in result
             if item['type'] == 'file']),
        set(files_in_sub))
    # we also get one report on the subdir
    eq_(len(result) - 1, len(files_in_sub))

    # got all files beneath subdir:
    ok_(all(ds.repo.file_has_content(files_in_sub)))

    # additionally got file1.txt silently, since it has the same content as
    # subdir/subsubdir/file4.txt:
    ok_(ds.repo.file_has_content('file1.txt') is True)

def test_runnin_on_empty(path):
    # empty repo
    repo = AnnexRepo(path, create=True)
    # just wrap with a dataset
    ds = Dataset(path)
    # and run status ... should be good and do nothing
    eq_([], ds.status())

def test_install_skip_list_arguments(src, path, path_outside):
    ds = install(path, source=src)
    ok_(ds.is_installed())

    # install a list with valid and invalid items:
    result = ds.install(
        path=['subm 1', 'not_existing', path_outside, '2'],
        get_data=False,
        on_failure='ignore', result_xfm=None, return_type='list')

    # good and bad results together
    ok_(isinstance(result, list))
    eq_(len(result), 4)
    # check that we have an 'impossible' status for both invalid args
    # but all the other tasks have been accomplished
    for skipped, msg in [(opj(ds.path, 'not_existing'), "path does not exist"),
                         (path_outside, "path not associated with any dataset")]:
        assert_result_count(
            result, 1, status='impossible', message=msg, path=skipped)
    for sub in [Dataset(opj(path, 'subm 1')), Dataset(opj(path, '2'))]:
        assert_result_count(
            result, 1, status='ok',
            message=('Installed subdataset in order to get %s', sub.path))
        ok_(sub.is_installed())

    # return of get is always a list, by default, even if just one thing was gotten
    # in this case 'subm 1' was already obtained above, so this will get the
    # content of the subdataset
    with assert_raises(IncompleteResultsError) as cme:
        ds.install(path=['subm 1', 'not_existing'])

    with assert_raises(IncompleteResultsError) as cme:
        ds.get(path=['subm 1', 'not_existing'])

def test_addurls_subdataset(self, path):
    ds = Dataset(path).create(force=True)

    with chpwd(path):
        for save in True, False:
            label = "save" if save else "nosave"
            hexsha_before = ds.repo.get_hexsha()
            ds.addurls(self.json_file, "{url}",
                       "{subdir}-" + label + "//{name}",
                       save=save)
            hexsha_after = ds.repo.get_hexsha()

            for fname in ["foo-{}/a", "bar-{}/b", "foo-{}/c"]:
                ok_exists(fname.format(label))

            assert_true(save ^ (hexsha_before == hexsha_after))
            assert_true(save ^ ds.repo.dirty)

        # Now save the "--nosave" changes and check that we have
        # all the subdatasets.
        ds.add(".")
        eq_(set(subdatasets(ds, recursive=True, result_xfm="relpaths")),
            {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

        # We don't try to recreate existing subdatasets.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
            assert_in("Not creating subdataset at existing path", cml.out)

def test_install_known_subdataset(src, path):
    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that it is the correct submodule installed and not
    # new repository initiated
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())

def test_install_list(path, top_path):
    # we want to be able to install several things, if these are known
    # (no 'source' allowed). Therefore first toplevel:
    ds = install(top_path, source=path, recursive=False)
    assert_not_in('annex.hardlink', ds.config)
    ok_(ds.is_installed())
    sub1 = Dataset(opj(top_path, 'subm 1'))
    sub2 = Dataset(opj(top_path, '2'))
    ok_(not sub1.is_installed())
    ok_(not sub2.is_installed())

    # fails, when `source` is passed:
    assert_raises(ValueError, ds.install,
                  path=['subm 1', '2'],
                  source='something')

    # now should work:
    result = ds.install(path=['subm 1', '2'], result_xfm='paths')
    ok_(sub1.is_installed())
    ok_(sub2.is_installed())
    eq_(set(result), {sub1.path, sub2.path})
    # and if we request it again via get, result should be empty
    get_result = ds.get(path=['subm 1', '2'], get_data=False)
    assert_status('notneeded', get_result)

def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src, description='mydummy')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = install(path, source=src, result_xfm=None, return_type='list')
    assert_status('notneeded', res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)

def test_install_into_dataset(source, top_path):
    ds = create(top_path)
    ok_clean_git(ds.path)

    subds = ds.install("sub", source=source, save=False)
    if isinstance(subds.repo, AnnexRepo) and subds.repo.is_direct_mode():
        ok_(exists(opj(subds.path, '.git')))
    else:
        ok_(isdir(opj(subds.path, '.git')))
    ok_(subds.is_installed())
    assert_in('sub', ds.subdatasets(result_xfm='relpaths'))
    # sub is clean:
    ok_clean_git(subds.path, annex=None)
    # top is too:
    ok_clean_git(ds.path, annex=None)
    ds.save('addsub')
    # now it is:
    ok_clean_git(ds.path, annex=None)

    # but we could also save while installing and there should be no side-effect
    # of saving any other changes if we state to not auto-save changes
    # Create a dummy change
    create_tree(ds.path, {'dummy.txt': 'buga'})
    ok_clean_git(ds.path, untracked=['dummy.txt'])
    subds_ = ds.install("sub2", source=source)
    eq_(subds_.path, opj(ds.path, "sub2"))  # for paranoid yoh ;)
    ok_clean_git(ds.path, untracked=['dummy.txt'])

    # and we should achieve the same behavior if we create a dataset
    # and then decide to add it
    create(_path_(top_path, 'sub3'))
    ok_clean_git(ds.path, untracked=['dummy.txt', 'sub3/'])
    ds.add('sub3')
    ok_clean_git(ds.path, untracked=['dummy.txt'])

def test_failed_install_multiple(top_path):
    ds = create(top_path)

    create(_path_(top_path, 'ds1'))
    create(_path_(top_path, 'ds3'))
    ok_clean_git(ds.path, annex=False, untracked=['ds1/', 'ds3/'])

    # specify install with multiple paths and one non-existing
    with assert_raises(IncompleteResultsError) as cme:
        ds.install(['ds1', 'ds2', '///crcns', '///nonexisting', 'ds3'])

    # install doesn't add existing submodules -- add does that
    ok_clean_git(ds.path, annex=False, untracked=['ds1/', 'ds3/'])
    ds.add(['ds1', 'ds3'])
    ok_clean_git(ds.path, annex=False)
    # those which succeeded should be saved now
    eq_(ds.get_subdatasets(), ['crcns', 'ds1', 'ds3'])
    # and those which didn't -- listed
    eq_(set(cme.exception.failed), {'///nonexisting', _path_(top_path, 'ds2')})

    # but if there was only a single installation requested -- it will be
    # InstallFailedError to stay consistent with single install behavior
    # TODO: unify at some point
    with assert_raises(InstallFailedError) as cme:
        ds.install('ds2')
    with assert_raises(InstallFailedError) as cme:
        ds.install('///nonexisting')

def test_ssh_open_close(tfile1):
    manager = SSHManager()

    path = opj(manager.socket_dir, get_connection_hash('localhost'))
    # TODO: facilitate the test when it didn't exist
    existed_before = exists(path)
    print("%s existed: %s" % (path, existed_before))

    c1 = manager.get_connection('ssh://localhost')
    c1.open()
    # control master exists for sure now
    ok_(exists(path))

    # use connection to execute remote command:
    local_home = os.path.expanduser('~')
    # we list explicitly local HOME since we override it in module_setup
    out, err = c1('ls -a %r' % local_home)
    remote_ls = [entry for entry in out.splitlines()
                 if entry != '.' and entry != '..']
    local_ls = os.listdir(local_home)
    eq_(set(remote_ls), set(local_ls))

    # now test for arguments containing spaces and other pleasant symbols
    out, err = c1('ls -l {}'.format(sh_quote(tfile1)))
    assert_in(tfile1, out)
    eq_(err, '')

    c1.close()
    # control master doesn't exist anymore:
    ok_(exists(path) == existed_before)

def test_ssh_manager_close():
    manager = SSHManager()

    # check for previously existing sockets:
    existed_before_1 = exists(opj(manager.socket_dir, 'localhost'))
    existed_before_2 = exists(opj(manager.socket_dir, 'datalad-test'))

    manager.get_connection('ssh://localhost').open()
    manager.get_connection('ssh://datalad-test').open()

    if existed_before_1 and existed_before_2:
        # we need one connection to be closed and therefore being opened
        # by `manager`
        manager.get_connection('ssh://localhost').close()
        manager.get_connection('ssh://localhost').open()

    ok_(exists(opj(manager.socket_dir, get_connection_hash('localhost'))))
    ok_(exists(opj(manager.socket_dir, get_connection_hash('datalad-test'))))

    manager.close()

    still_exists_1 = exists(opj(manager.socket_dir, 'localhost'))
    still_exists_2 = exists(opj(manager.socket_dir, 'datalad-test'))

    eq_(existed_before_1, still_exists_1)
    eq_(existed_before_2, still_exists_2)

def test_get_disposition_filename():
    input = {'url': 'http://human.brain-map.org/api/v2/well_known_file_download/157722290'}
    with patch('datalad.crawler.nodes.misc.get_url_disposition_filename',
               return_value="T1.nii.gz"):
        output = list(get_disposition_filename(input))
    eq_(len(output), 1)
    eq_(output[0]['filename'], 'T1.nii.gz')

def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)

    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # test file has it, but uniques have it blanked out, because the extractor
    # considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence gets excluded from the unique report
    assert_in('date', meta)
    assert(not meta['date'])
    assert_not_in('date', uniques['audio'])

def test_get_mixed_hierarchy(src, path):
    origin = Dataset(src).create(no_annex=True)
    origin_sub = origin.create('subds')
    with open(opj(origin.path, 'file_in_git.txt'), "w") as f:
        f.write('no idea')
    with open(opj(origin_sub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')

    origin.add('file_in_git.txt', to_git=True)
    origin_sub.add('file_in_annex.txt')
    origin.save(all_changes=True)

    # now, install that thing:
    ds, subds = install(path, source=src, recursive=True)
    ok_(subds.repo.file_has_content("file_in_annex.txt") is False)

    # and get:
    with swallow_logs(new_level=logging.DEBUG) as cml:
        result = ds.get(curdir, recursive=True)
        assert_re_in('.*Found no annex at {0}. Skipped.'.format(ds),
                     cml.out, flags=re.DOTALL)
        eq_(len(result), 1)
        eq_(result[0]['file'], opj("subds", "file_in_annex.txt"))
        ok_(result[0]['success'] is True)
        ok_(subds.repo.file_has_content("file_in_annex.txt") is True)

def test_submodule_deinit(path):
    from datalad.support.annexrepo import AnnexRepo

    top_repo = AnnexRepo(path, create=False)
    eq_({'subm 1', '2'},
        {s.name for s in top_repo.get_submodules()})
    # note: here init=True is ok, since we are using it just for testing
    with swallow_logs(new_level=logging.WARN) as cml:
        top_repo.update_submodule('subm 1', init=True)
        assert_in('Do not use update_submodule with init=True', cml.out)
    top_repo.update_submodule('2', init=True)

    # ok_(all([s.module_exists() for s in top_repo.get_submodules()]))
    # TODO: old assertion above if non-bare? (can't use "direct mode" in test_gitrepo)
    # Alternatively: New testrepo (plain git submodules) and have a dedicated
    # test for annexes in addition
    ok_(all([GitRepo.is_valid_repo(op.join(top_repo.path, s.path))
             for s in top_repo.get_submodules()]))

    # modify submodule:
    with open(op.join(top_repo.path, 'subm 1', 'file_ut.dat'), "w") as f:
        f.write("some content")

    assert_raises(CommandError, top_repo.deinit_submodule, 'sub1')

    # using force should work:
    top_repo.deinit_submodule('subm 1', force=True)

    ok_(not top_repo.repo.submodule('subm 1').module_exists())

def test_uninstall_git_file(path):
    ds = Dataset(path)
    ok_(ds.is_installed())
    ok_(exists(opj(path, 'INFO.txt')))
    ok_file_under_git(ds.repo.path, 'INFO.txt')

    # drop file in Git in an annex repo
    # regardless of the type of repo this is 'notneeded'...
    # it is less about education than about "can we get the content back?",
    # and for a file in Git we can
    assert_result_count(
        ds.drop(path='INFO.txt'),
        1,
        status='notneeded',
        message="no annex'ed content")

    res = ds.uninstall(path="INFO.txt", on_failure='ignore')
    assert_result_count(
        res, 1,
        status='impossible',
        message='can only uninstall datasets (consider the `drop` command)')

    # remove the file:
    res = ds.remove(path='INFO.txt', result_xfm='paths',
                    result_filter=lambda x: x['action'] == 'remove')
    assert_raises(AssertionError, ok_file_under_git, ds.repo.path, 'INFO.txt')
    ok_(not exists(opj(path, 'INFO.txt')))
    eq_(res, ['INFO.txt'])

def test_get_url_parts():
    eq_(au.get_url_parts(""), {})
    assert_dict_equal(au.get_url_parts("http://datalad.org"),
                      {"_url_hostname": "datalad.org"})

    assert_dict_equal(au.get_url_parts("http://datalad.org/about.html"),
                      {"_url_hostname": "datalad.org",
                       "_url0": "about.html",
                       "_url_basename": "about.html",
                       "_url_basename_root_py": "about",
                       "_url_basename_ext_py": ".html",
                       "_url_basename_root": "about",
                       "_url_basename_ext": ".html"})
    assert_dict_equal(au.get_url_parts("http://datalad.org/about.html"),
                      au.get_url_parts("http://datalad.org//about.html"))
    assert_dict_equal(
        au.get_url_parts("http://datalad.org/for/git-users"),
        {"_url_hostname": "datalad.org",
         "_url0": "for",
         "_url1": "git-users",
         "_url_basename": "git-users",
         "_url_basename_root_py": "git-users",
         "_url_basename_ext_py": "",
         "_url_basename_root": "git-users",
         "_url_basename_ext": ""})

def test_GitRepo_pull(test_path, orig_path, clone_path):
    origin = GitRepo.clone(test_path, orig_path)
    clone = GitRepo.clone(orig_path, clone_path)
    filename = get_most_obscure_supported_name()

    with open(op.join(orig_path, filename), 'w') as f:
        f.write("New file.")
    origin.add(filename)
    origin.commit("new file added.")
    clone.pull()
    ok_(op.exists(op.join(clone_path, filename)))

    # While at it, let's test _get_remotes_having_commit a bit
    clone.add_remote("very_origin", test_path)
    clone.fetch("very_origin")
    eq_(
        clone._get_remotes_having_commit(clone.get_hexsha()),
        ['origin']
    )
    prev_commit = clone.get_hexsha('HEAD^')
    eq_(
        set(clone._get_remotes_having_commit(prev_commit)),
        {'origin', 'very_origin'}
    )

def _test_match_basic(matcher, query):
    extracts = dict(
        xpaths={'text': 'text()'},
        csss={'favorite': '.class1::text'}
    )
    m = matcher(query, **extracts)

    mg = m(dict(response="<div></div>"))
    ok_(inspect.isgenerator(mg))
    eq_(list(mg), [])  # there should be no hits

    mg = m(dict(response=sample1.response))
    ok_(inspect.isgenerator(mg))
    hits = list(mg)
    eq_(len(hits), 3)
    for hit, a_html, a_text, class1_text in zip(
            hits, sample1.a_htmls, sample1.a_texts, sample1.class1_texts):
        ok_(hit['response'])
        eq_(hit['match'], a_html)
        eq_(hit['text'], a_text)
        eq_(hit.get('favorite', None), class1_text)

    m = matcher(query, min_count=4, **extracts)
    mg = m(dict(response=sample1.response))
    ok_(inspect.isgenerator(mg))
    assert_raises(ValueError, list, mg)

    m = matcher(query, max_count=2, **extracts)
    mg = m(dict(response=sample1.response))
    ok_(inspect.isgenerator(mg))
    assert_raises(ValueError, list, mg)

def test_kill(path):
    # nested datasets with load
    ds = Dataset(path).create()
    testfile = opj(ds.path, "file.dat")
    with open(testfile, 'w') as f:
        f.write("load")
    ds.save("file.dat")
    subds = ds.create('deep1')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['deep1'])
    ok_clean_git(ds.path)

    # and we fail to remove since content can't be dropped
    res = ds.remove(on_failure='ignore')
    assert_result_count(
        res, 1,
        status='error', path=testfile)
    # Following two assertions on message are relying on the actual error.
    # We have a second result with status 'impossible' for the ds, that we need
    # to filter out for those assertions:
    err_result = [r for r in res if r['status'] == 'error'][0]
    assert_result_values_cond(
        [err_result], 'message',
        lambda x: "configured minimum number of copies not found" in x or
        "Could only verify the existence of 0 out of 1 necessary copies" in x
    )
    eq_(ds.remove(recursive=True, check=False, result_xfm='datasets'),
        [subds, ds])
    ok_(not exists(path))

def test_GitRepo_fetch(test_path, orig_path, clone_path):
    origin = GitRepo.clone(test_path, orig_path)
    clone = GitRepo.clone(orig_path, clone_path)
    filename = get_most_obscure_supported_name()

    origin.checkout("new_branch", ['-b'])
    with open(op.join(orig_path, filename), 'w') as f:
        f.write("New file.")
    origin.add(filename)
    origin.commit("new file added.")

    fetched = clone.fetch(remote='origin')
    # test FetchInfo list returned by fetch
    eq_([u'origin/' + clone.get_active_branch(), u'origin/new_branch'],
        [commit.name for commit in fetched])

    ok_clean_git(clone.path, annex=False)
    assert_in("origin/new_branch", clone.get_remote_branches())
    assert_in(filename, clone.get_files("origin/new_branch"))
    assert_false(op.exists(op.join(clone_path, filename)))  # not checked out

    # create a remote without an URL:
    origin.add_remote('not-available', 'git://example.com/not/existing')
    origin.config.unset('remote.not-available.url', where='local')

    # fetch without provided URL
    fetched = origin.fetch('not-available')
    # nothing was done, nothing returned:
    eq_([], fetched)

def test_aggregate_removal(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(opj('subsub'), force=True)
    base.add('.', recursive=True)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, path=subsub.path)
    # check that we only have object files that are listed in agginfo
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # now delete the deepest subdataset to test cleanup of aggregated objects
    # in the top-level ds
    base.remove(opj('sub', 'subsub'), check=False)
    # now aggregation has to detect that subsub is not simply missing, but gone
    # for good
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    # internally consistent state
    eq_(_get_contained_objs(base), _get_referenced_objs(base))
    # info on subsub was removed at all levels
    res = base.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 2)
    res = sub.metadata(get_aggregates=True)
    assert_result_count(res, 0, path=subsub.path)
    assert_result_count(res, 1)

def test_GitRepo_get_remote_url(orig_path, path):
    gr = GitRepo.clone(orig_path, path)
    gr.add_remote('github', 'git://github.com/datalad/testrepo--basic--r1')
    eq_(gr.get_remote_url('origin'), orig_path)
    eq_(gr.get_remote_url('github'),
        'git://github.com/datalad/testrepo--basic--r1')

def test_GitRepo_add(src, path):
    gr = GitRepo.clone(src, path)
    filename = get_most_obscure_supported_name()
    with open(op.join(path, filename), 'w') as f:
        f.write("File to add to git")
    added = gr.add(filename)

    eq_(added, {'success': True, 'file': filename})
    assert_in(filename, gr.get_indexed_files(),
              "%s not successfully added to %s" % (filename, path))
    # uncommitted:
    ok_(gr.dirty)

    filename = "another.txt"
    with open(op.join(path, filename), 'w') as f:
        f.write("Another file to add to git")

    # include committing:
    added2 = gr.add(filename)
    gr.commit(msg="Add two files.")
    eq_(added2, {'success': True, 'file': filename})

    assert_in(filename, gr.get_indexed_files(),
              "%s not successfully added to %s" % (filename, path))

    ok_clean_git(path)

def test_notclone_known_subdataset(src, path):
    # get the superdataset:
    ds = clone(src, path,
               result_xfm='datasets', return_type='item-or-list')

    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # clone is not meaningful
    res = ds.clone('subm 1', on_failure='ignore')
    assert_status('error', res)
    assert_message('Failed to clone from any candidate source URL. '
                   'Encountered errors per each url were: %s',
                   res)
    # get does the job
    res = ds.get(path='subm 1', get_data=False)
    assert_status('ok', res)
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that it is the correct submodule installed and not
    # new repository initiated
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))

def test_reaggregate_with_unavailable_objects(path):
    base = Dataset(opj(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(opj(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.add('.', recursive=True)
    ok_clean_git(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    ok_clean_git(base.path)
    objpath = opj('.datalad', 'metadata', 'objects')
    objs = list(sorted(base.repo.find(objpath)))
    # we have 3x2 metadata sets (dataset/files) under annex
    eq_(len(objs), 6)
    eq_(all(base.repo.file_has_content(objs)), True)
    # drop all object content
    base.drop(objs, check=False)
    eq_(all(base.repo.file_has_content(objs)), False)
    ok_clean_git(base.path)
    # now re-aggregate, the state hasn't changed, so the file names will
    # be the same
    base.aggregate_metadata(recursive=True, update_mode='all', force_extraction=True)
    eq_(all(base.repo.file_has_content(objs)), True)
    # and there are no new objects
    eq_(
        objs,
        list(sorted(base.repo.find(objpath)))
    )

def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict(
            [(os.path.join(*o.split(os.sep)[-2:]),
              os.stat(o).st_ino)
             for o in glob(os.path.join(repo.path,
                                        repo.get_git_dir(repo),
                                        'objects', '*', '*'))])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)

def test_procedure_discovery(path, super_path):
    with chpwd(path):
        # ^ Change directory so that we don't fail with an
        # InvalidGitRepositoryError if the test is executed from a git
        # worktree.
        ps = run_procedure(discover=True)
        # there are a few procedures coming with datalad, needs to find them
        assert_true(len(ps) > 2)
        # we get three essential properties
        eq_(
            sum(['procedure_type' in p and
                 'procedure_callfmt' in p and
                 'path' in p
                 for p in ps]),
            len(ps))

    # set up dataset with registered procedure (c&p from test_basics):
    ds = Dataset(path).create(force=True)
    ds.run_procedure('cfg_yoda')
    ok_clean_git(ds.path)
    # configure dataset to look for procedures in its code folder
    ds.config.add(
        'datalad.locations.dataset-procedures',
        'code',
        where='dataset')
    # configure dataset to run the demo procedure prior to the clean command
    ds.config.add(
        'datalad.clean.proc-pre',
        'datalad_test_proc',
        where='dataset')
    ds.save(op.join('.datalad', 'config'))

    # run discovery on the dataset:
    ps = ds.run_procedure(discover=True)

    # still needs to find procedures coming with datalad
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum(['procedure_type' in p and
             'procedure_callfmt' in p and
             'path' in p
             for p in ps]),
        len(ps))
    # dataset's procedure needs to be in the results
    assert_in_results(ps, path=op.join(ds.path, 'code', 'datalad_test_proc.py'))

    # make it a subdataset and try again:
    super = Dataset(super_path).create()
    super.install('sub', source=ds.path)

    ps = super.run_procedure(discover=True)
    # still needs to find procedures coming with datalad
    assert_true(len(ps) > 2)
    # we get three essential properties
    eq_(
        sum(['procedure_type' in p and
             'procedure_callfmt' in p and
             'path' in p
             for p in ps]),
        len(ps))
    # dataset's procedure needs to be in the results
    assert_in_results(
        ps, path=op.join(super.path, 'sub', 'code', 'datalad_test_proc.py'))

    if not on_windows:  # no symlinks
        import os
        # create a procedure which is a broken symlink, but recognizable as a
        # python script:
        os.symlink(op.join(super.path, 'sub', 'not_existent'),
                   op.join(super.path, 'sub', 'code', 'broken_link_proc.py'))
        # broken symlink at procedure location, but we can't tell, whether it is
        # an actual procedure without any guess on how to execute it:
        os.symlink(op.join(super.path, 'sub', 'not_existent'),
                   op.join(super.path, 'sub', 'code', 'unknwon_broken_link'))
        ps = super.run_procedure(discover=True)
        # still needs to find procedures coming with datalad and the dataset
        # procedure registered before
        assert_true(len(ps) > 3)
        assert_in_results(
            ps,
            path=op.join(super.path, 'sub', 'code', 'broken_link_proc.py'),
            state='absent')
        assert_not_in_results(
            ps,
            path=op.join(super.path, 'sub', 'code', 'unknwon_broken_link'))

def test_nested_pushclone_cycle_allplatforms(origpath, storepath, clonepath):
    if 'DATALAD_SEED' in os.environ:
        # we are using create-sibling-ria via the cmdline in here
        # this will create random UUIDs for datasets
        # however, given a fixed seed each call to this command will start
        # with the same RNG seed, hence yield the same UUID on the same
        # machine -- leading to a collision
        raise SkipTest(
            'Test incompatible with fixed random number generator seed')

    # the aim of this high-level test is a std create-push-clone cycle for a
    # dataset with a subdataset, with the goal to ensure that correct branches
    # and commits are tracked, regardless of platform behavior and condition
    # of individual clones. Nothing fancy, just that the defaults behave in
    # sensible ways
    from datalad.cmd import WitlessRunner as Runner
    run = Runner().run

    # create original nested dataset
    with chpwd(origpath):
        run(['datalad', 'create', 'super'])
        run(['datalad', 'create', '-d', 'super', str(Path('super', 'sub'))])

    # verify essential linkage properties
    orig_super = Dataset(Path(origpath, 'super'))
    orig_sub = Dataset(orig_super.pathobj / 'sub')

    (orig_super.pathobj / 'file1.txt').write_text('some1')
    (orig_sub.pathobj / 'file2.txt').write_text('some1')
    with chpwd(orig_super.path):
        run(['datalad', 'save', '--recursive'])
    # TODO not yet reported clean with adjusted branches
    #assert_repo_status(orig_super.path)

    # the "true" branch that sub is on, and the gitsha of the HEAD commit of it
    orig_sub_corr_branch = \
        orig_sub.repo.get_corresponding_branch() or orig_sub.repo.get_active_branch()
    orig_sub_corr_commit = orig_sub.repo.get_hexsha(orig_sub_corr_branch)

    # make sure the super tracks this commit
    assert_in_results(
        orig_super.subdatasets(),
        path=orig_sub.path,
        gitshasum=orig_sub_corr_commit,
        # TODO it should also track the branch name
        # Attempted: https://github.com/datalad/datalad/pull/3817
        # But reverted: https://github.com/datalad/datalad/pull/4375
    )

    # publish to a store, to get into a platform-agnostic state
    # (i.e. no impact of an annex-init of any kind)
    store_url = 'ria+' + get_local_file_url(storepath)
    with chpwd(orig_super.path):
        run(['datalad', 'create-sibling-ria', '--recursive',
             '-s', 'store', store_url])
        run(['datalad', 'push', '--recursive', '--to', 'store'])

    # we are using the 'store' sibling's URL, which should be a plain path
    store_super = AnnexRepo(orig_super.siblings(name='store')[0]['url'], init=False)
    store_sub = AnnexRepo(orig_sub.siblings(name='store')[0]['url'], init=False)

    # both datasets in the store only carry the real branches, and nothing
    # adjusted
    for r in (store_super, store_sub):
        eq_(set(r.get_branches()), set([orig_sub_corr_branch, 'git-annex']))

    # and reobtain from a store
    cloneurl = 'ria+' + get_local_file_url(str(storepath), compatibility='git')
    with chpwd(clonepath):
        run(['datalad', 'clone', cloneurl + '#' + orig_super.id, 'super'])
        run(['datalad', '-C', 'super', 'get', '--recursive', '.'])

    # verify that nothing has changed as a result of a push/clone cycle
    clone_super = Dataset(Path(clonepath, 'super'))
    clone_sub = Dataset(clone_super.pathobj / 'sub')
    assert_in_results(
        clone_super.subdatasets(),
        path=clone_sub.path,
        gitshasum=orig_sub_corr_commit,
    )

    for ds1, ds2, f in ((orig_super, clone_super, 'file1.txt'),
                        (orig_sub, clone_sub, 'file2.txt')):
        eq_((ds1.pathobj / f).read_text(), (ds2.pathobj / f).read_text())

    # get status info that does not recurse into subdatasets, i.e. not
    # looking for uncommitted changes
    # we should see no modification reported
    assert_not_in_results(
        clone_super.status(eval_subdataset_state='commit'),
        state='modified')
    # and now the same for a more expensive full status
    assert_not_in_results(
        clone_super.status(recursive=True),
        state='modified')

def test_force_checkdatapresent(srcpath, dstpath):
    src = Dataset(srcpath).create()
    target = mk_push_target(src, 'target', dstpath, annex=True, bare=True)
    (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.")
    src.save(to_git=False, message="New annex file")
    assert_repo_status(src.path, annex=True)
    whereis_prior = src.repo.whereis(files=['test_mod_annex_file'])[0]

    res = src.push(to='target', data='nothing')
    # nothing reported to be copied
    assert_not_in_results(res, action='copy')
    # we got the git-push nevertheless
    eq_(src.repo.get_hexsha(DEFAULT_BRANCH),
        target.get_hexsha(DEFAULT_BRANCH))
    # nothing moved
    eq_(whereis_prior, src.repo.whereis(files=['test_mod_annex_file'])[0])

    # now a push without forced no-transfer
    # we do not give since, so the non-transfered file is picked up
    # and transferred
    res = src.push(to='target', force=None)
    # no branch change, done before
    assert_in_results(res, action='publish', status='notneeded',
                      refspec=DEFAULT_REFSPEC)
    # but availability update
    assert_in_results(res, action='publish', status='ok',
                      refspec='refs/heads/git-annex:refs/heads/git-annex')
    assert_in_results(res, status='ok',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy')
    # whereis info reflects the change
    ok_(len(whereis_prior) <
        len(src.repo.whereis(files=['test_mod_annex_file'])[0]))

    # do it yet again will do nothing, because all is uptodate
    assert_status('notneeded', src.push(to='target', force=None))
    # an explicit reference point doesn't change that
    assert_status('notneeded',
                  src.push(to='target', force=None, since='HEAD~1'))

    # now force data transfer
    res = src.push(to='target', force='checkdatapresent')
    # no branch change, done before
    assert_in_results(res, action='publish', status='notneeded',
                      refspec=DEFAULT_REFSPEC)
    # no availability update
    assert_in_results(res, action='publish', status='notneeded',
                      refspec='refs/heads/git-annex:refs/heads/git-annex')
    # but data transfer
    assert_in_results(res, status='ok',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy')

    # force data transfer, but data isn't available
    src.repo.drop('test_mod_annex_file')
    res = src.push(to='target', path='.', force='checkdatapresent',
                   on_failure='ignore')
    assert_in_results(res, status='impossible',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy',
                      message='Slated for transport, but no content present')

def test_filter_legal_metafield():
    eq_(au.filter_legal_metafield(["legal", "_not", "legal_still"]),
        ["legal", "legal_still"])

def test_rerun_branch(path):
    ds = Dataset(path).create()

    ds.repo.tag("prerun")

    outfile = op.join(path, "run-file")

    with swallow_outputs():
        ds.run('echo x$(cat run-file) > run-file')
    ds.rerun()
    eq_('xx\n', open(outfile).read())

    with open(op.join(path, "nonrun-file"), "w") as f:
        f.write("foo")
    ds.save("nonrun-file")

    # Rerun the commands on a new branch that starts at the parent
    # commit of the first run.
    with swallow_outputs():
        ds.rerun(since="prerun", onto="prerun", branch="rerun")

    eq_(ds.repo.get_active_branch(), "rerun")
    eq_('xx\n', open(outfile).read())

    # NOTE: This test depends on the non-run commit above following a run
    # commit. Otherwise, all the metadata (e.g., author date) aside from the
    # parent commit that is used to generate the commit ID may be set when
    # running the tests, which would result in two commits rather than three.
    for revrange in ["rerun..master", "master..rerun"]:
        eq_(len(ds.repo.get_revisions(revrange)), 3)
    eq_(ds.repo.get_merge_base(["master", "rerun"]),
        ds.repo.get_hexsha("prerun"))

    # Start rerun branch at tip of current branch.
    ds.repo.checkout("master")
    ds.rerun(since="prerun", branch="rerun2")
    eq_(ds.repo.get_active_branch(), "rerun2")
    eq_('xxxx\n', open(outfile).read())

    eq_(len(ds.repo.get_revisions("master..rerun2")), 2)
    eq_(len(ds.repo.get_revisions("rerun2..master")), 0)

    # Using an existing branch name fails.
    ds.repo.checkout("master")
    assert_raises(IncompleteResultsError,
                  ds.rerun, since="prerun", branch="rerun2")

def test_get_flexible_source_candidates_for_submodule(t, t2):
    f = _get_flexible_source_candidates_for_submodule
    # for now without mocking -- let's just really build a dataset
    ds = create(t)
    clone = install(
        t2, source=t,
        result_xfm='datasets', return_type='item-or-list')

    # first one could just know about itself or explicit url provided
    sshurl = 'ssh://e.c'
    httpurl = 'http://e.c'
    # Expansion with '/.git' no longer done in this helper
    #sm_httpurls = [httpurl, httpurl + '/.git']
    sm_httpurls = [httpurl]
    eq_(f(ds, 'sub'), [])
    eq_(f(ds, 'sub', sshurl), [sshurl])
    eq_(f(ds, 'sub', httpurl), sm_httpurls)
    eq_(f(ds, 'sub', None), [])  # otherwise really we have no clue where to get it from

    # but if we work on dsclone then it should also add urls deduced from its
    # own location default remote for current branch
    eq_(f(clone, 'sub'), [t + '/sub'])
    eq_(f(clone, 'sub', sshurl), [t + '/sub', sshurl])
    eq_(f(clone, 'sub', httpurl), [t + '/sub'] + sm_httpurls)
    eq_(f(clone, 'sub'), [t + '/sub'])  # otherwise really we have no clue where to get it from

def test_loads():
    eq_(loads('{"a": 2}'), {'a': 2})
    with assert_raises(JSONDecodeError), \
            swallow_logs(new_level=logging.WARNING) as cml:
        loads('{"a": 2}x')
    assert_in('Failed to load content from', cml.out)

def test_load_screwy_unicode(fname):
    # test that we can tolerate some screwy unicode embeddings within json
    assert_raises(JSONDecodeError, load, fname, fixup=False)
    with swallow_logs(new_level=logging.WARNING) as cml:
        eq_(load(fname), {'Authors': ['A1', 'A2']})
        assert_in('Failed to decode content', cml.out)

def test_formatter_missing_arg():
    fmt = au.Formatter({}, "NA")
    eq_(fmt.format("{here},{nothere}", {"here": "ok", "nothere": ""}),
        "ok,NA")

def test_formatter_placeholder_with_spaces():
    fmt = au.Formatter({})
    eq_(fmt.format("{with spaces}", {"with spaces": "value0"}),
        "value0")

def test_split_ext():
    eq_(au.split_ext("file"), ("file", ""))
    eq_(au.split_ext("file.py"), ("file", ".py"))
    eq_(au.split_ext("file.tar.gz"), ("file", ".tar.gz"))
    eq_(au.split_ext("file.toolong.gz"), ("file.toolong", ".gz"))
    eq_(au.split_ext("file.a.b.c.d"), ("file", ".a.b.c.d"))
    eq_(au.split_ext("file.a.b.cccc.d"), ("file", ".a.b.cccc.d"))
    eq_(au.split_ext("file.a.b.ccccc.d"), ("file.a.b.ccccc", ".d"))
    eq_(au.split_ext("file.a.b..c"), ("file", ".a.b..c"))

def test_save(path):
    ds = Dataset(path)

    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("something")

    ds.repo.add("new_file.tst", git=True)
    ok_(ds.repo.dirty)

    ds.save(message="add a new file")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("modify")

    ok_(ds.repo.dirty)
    ds.save(message="modified new_file.tst")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # save works without ds and files given in the PWD
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("rapunzel")
    with chpwd(path):
        save(message="love rapunzel")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # and also without `-a` when things are staged
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("exotic")
    ds.repo.add("new_file.tst", git=True)
    with chpwd(path):
        save(message="love marsians")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    files = ['one.txt', 'two.txt']
    for fn in files:
        with open(op.join(path, fn), "w") as f:
            f.write(fn)

    ds.save([op.join(path, f) for f in files])
    # superfluous call to save (all saved already), should not fail
    # but report that nothing was saved
    assert_status('notneeded', ds.save(message="set of new files"))
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # create subdataset
    subds = ds.create('subds')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # modify subds
    with open(op.join(subds.path, "some_file.tst"), "w") as f:
        f.write("something")
    subds.save()
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    # ensure modified subds is committed
    ds.save()
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # now introduce a change downstairs
    subds.create('someotherds')
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    ok_(ds.repo.dirty)
    # and save via subdataset path
    ds.save('subds', version_tag='new_sub')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    tags = ds.repo.get_tags()
    ok_(len(tags) == 1)
    eq_(tags[0], dict(hexsha=ds.repo.get_hexsha(), name='new_sub'))
    # fails when retagged, like git does
    res = ds.save(version_tag='new_sub', on_failure='ignore')
    assert_status('error', res)
    assert_result_count(
        res, 1,
        action='save', type='dataset', path=ds.path,
        message=('cannot tag this version: %s',
                 "fatal: tag 'new_sub' already exists"))

def test_get_recurse_subdatasets(src, path):
    ds = install(
        path, source=src,
        result_xfm='datasets', return_type='item-or-list')

    # ask for the two subdatasets specifically. This will obtain them,
    # but not any content of any files in them
    subds1, subds2 = ds.get(['subm 1', 'subm 2'], get_data=False,
                            description="youcouldnotmakethisup",
                            result_xfm='datasets')
    for d in (subds1, subds2):
        eq_(d.repo.get_description(), 'youcouldnotmakethisup')

    # there are 3 files to get: test-annex.dat within each dataset:
    rel_path_sub1 = opj(basename(subds1.path), 'test-annex.dat')
    rel_path_sub2 = opj(basename(subds2.path), 'test-annex.dat')
    annexed_files = {'test-annex.dat',
                     rel_path_sub1,
                     rel_path_sub2}

    # None of them is currently present:
    ok_(ds.repo.file_has_content('test-annex.dat') is False)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)

    ok_clean_git(subds1.path)
    # explicitly given path in subdataset => implicit recursion:
    # MIH: Nope, we fulfill the dataset handle, but that doesn't
    # imply fulfilling all file handles
    result = ds.get(rel_path_sub1, recursive=True)
    # all good actions
    assert_status('ok', result)

    assert_in_results(result, path=opj(ds.path, rel_path_sub1), status='ok')
    ok_(subds1.repo.file_has_content('test-annex.dat') is True)

    # drop it:
    subds1.repo.drop('test-annex.dat')
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)

    # now, with a path not explicitly pointing within a
    # subdataset, but recursive option:
    # get everything:
    result = ds.get(recursive=True,
                    result_filter=lambda x: x.get('type') != 'dataset')
    assert_status('ok', result)

    eq_(set([item.get('path')[len(ds.path) + 1:] for item in result
             if item['type'] == 'file']),
        annexed_files)
    ok_(ds.repo.file_has_content('test-annex.dat') is True)
    ok_(subds1.repo.file_has_content('test-annex.dat') is True)
    ok_(subds2.repo.file_has_content('test-annex.dat') is True)

    # drop them:
    ds.repo.drop('test-annex.dat')
    subds1.repo.drop('test-annex.dat')
    subds2.repo.drop('test-annex.dat')
    ok_(ds.repo.file_has_content('test-annex.dat') is False)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)

    # now, the very same call, but without recursive:
    result = ds.get('.', recursive=False)
    assert_status('ok', result)
    # one report is on the requested dir
    eq_(len(result) - 1, 1)
    assert_result_count(
        result, 1, path=opj(ds.path, 'test-annex.dat'), status='ok')
    ok_(ds.repo.file_has_content('test-annex.dat') is True)
    ok_(subds1.repo.file_has_content('test-annex.dat') is False)
    ok_(subds2.repo.file_has_content('test-annex.dat') is False)

def test_aggregation(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(op.join(path, 'origin')).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    assert_status('ok', ds.save(recursive=True))
    # while we are at it: do it again, nothing should happen
    assert_status('notneeded', ds.save(recursive=True))

    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.meta_aggregate(recursive=True, into='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='meta_aggregate')
    # the respective super datasets see two saves, one to record the change
    # in the subdataset after its own aggregation, and one after the super
    # updated with aggregated metadata
    assert_result_count(res, 5, status='ok', action='save', type='dataset')
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.meta_dump(reporton='aggregates', recursive=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.meta_dump(recursive=True)
    # basic sanity check
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(
        [r for r in origres if r['path'].endswith('.json')],
        3, type='file')  # Now that we have annex.key
    # three different IDs
    eq_(
        3,
        len(set([
            _get_dsid_from_core_metadata(s['metadata']['metalad_core'])
            for s in origres
            if s['type'] == 'dataset'
        ])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        op.join(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    eq_(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.meta_dump()
    # basic sanity check
    assert_result_count(cloneres, 1, type='dataset')
    # payload file
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])

def test_rerun_onto(path):
    ds = Dataset(path).create()
    # Make sure we have more than one commit. The one commit case is checked
    # elsewhere.
    ds.repo.commit(msg="noop commit", options=["--allow-empty"])

    grow_file = op.join(path, "grows")

    # Make sure we can handle range-specifications that yield no results.
    for since in ["", "HEAD"]:
        assert_result_count(
            ds.rerun("HEAD", onto="", since=since, on_failure="ignore"),
            1, status="impossible", action="run")

    ds.run('echo static-content > static')
    ds.repo.tag("static")
    with swallow_outputs():
        ds.run('echo x$(cat grows) > grows')
    ds.rerun()
    eq_('xx\n', open(grow_file).read())

    # If we run the "static" change on top of itself, we end up in the
    # same (but detached) place.
    ds.rerun(revision="static", onto="static")
    ok_(ds.repo.get_active_branch() is None)
    eq_(ds.repo.get_hexsha(),
        ds.repo.get_hexsha("static"))

    # If we run the "static" change from the same "base", we end up
    # with a new commit.
    ds.repo.checkout("master")
    with swallow_outputs():
        ds.rerun(revision="static", onto="static^")
    ok_(ds.repo.get_active_branch() is None)
    neq_(ds.repo.get_hexsha(),
         ds.repo.get_hexsha("static"))
    ok_(all(r["state"] == "clean" for r in ds.diff(fr="HEAD", to="static")))
    for revrange in ["..static", "static.."]:
        eq_(len(ds.repo.get_revisions(revrange)), 1)

    # Unlike the static change, if we run the ever-growing change on
    # top of itself, we end up with a new commit.
    ds.repo.checkout("master")
    ds.rerun(onto="HEAD")
    ok_(ds.repo.get_active_branch() is None)
    neq_(ds.repo.get_hexsha(),
         ds.repo.get_hexsha("master"))

    # An empty `onto` means use the parent of the first revision.
    ds.repo.checkout("master")
    with swallow_outputs():
        ds.rerun(since="static^", onto="")
    ok_(ds.repo.get_active_branch() is None)
    for revrange in ["..master", "master.."]:
        eq_(len(ds.repo.get_revisions(revrange)), 3)

    # An empty `onto` means use the parent of the first revision that
    # has a run command.
    ds.repo.checkout("master")
    with swallow_outputs():
        ds.rerun(since="", onto="", branch="from-base")
    eq_(ds.repo.get_active_branch(), "from-base")
    ok_(all(r["state"] == "clean"
            for r in ds.diff(fr="master", to="from-base")))
    eq_(ds.repo.get_merge_base(["static", "from-base"]),
        ds.repo.get_hexsha("static^"))

    # We abort when an explicitly specified `onto` doesn't exist.
    ds.repo.checkout("master")
    assert_result_count(
        ds.rerun(since="", onto="doesnotexist", branch="from-base",
                 on_failure="ignore"),
        1, status="error", action="run")

def test_openfmri_pipeline2(ind, topurl, outd):
    # no versioned files -- should still work! ;)

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # backend set, dataset init, crawler, init, incoming (shares with master -1),
    # (2 or 3 commits, depending on create variant)
    # incoming-processed, merge, aggregate metadata:
    ncommits_master = len(commits_hexsha['master'])
    assert_in(ncommits_master, [5, 6])
    assert_in(len(commits_l['master']), [4, 5])

    eq_(len(commits_hexsha['incoming']), ncommits_master - 2)
    eq_(len(commits_l['incoming']), ncommits_master - 2)
    eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 1)
    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 2)

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    eq_(out[0]['datalad_stats'], ActivityStats(files=2, skipped=2, urls=2))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    os.rename(opj(ind, 'ds666', 'ds666_R2.0.0.tar.gz'),
              opj(ind, 'ds666', 'ds666.tar.gz'))

    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)
    eq_(out[0]['datalad_stats'], ActivityStats())  # was committed
    stats_total = out[0]['datalad_stats'].get_total()
    stats_total.downloaded_size = 0
    eq_(stats_total,
        ActivityStats(files=4, overwritten=1, skipped=1, downloaded=1,
                      merges=[['incoming', 'incoming-processed']],
                      versions=['1.0.0'],
                      renamed=1, urls=2, add_annex=2))
    # in reality there is also 1.0.0+1 tag since file changed but no version suffix
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1'])

    check_dropall_get(repo)

def test_run_inputs_outputs(src, path):
    for subds in [("s0", "s1_0", "s2"),
                  ("s0", "s1_1", "s2"),
                  ("s0", "s1_0"),
                  ("s0", "s1_1"),
                  ("s0", "ss"),
                  ("s0",)]:
        Dataset(op.join(*((src,) + subds))).create(force=True).save()
    src_ds = Dataset(src).create(force=True)
    src_ds.save()

    ds = install(path, source=src,
                 result_xfm='datasets', return_type='item-or-list')
    assert_false(ds.repo.file_has_content("input.dat"))
    assert_false(ds.repo.file_has_content("extra-input.dat"))

    # The specified inputs and extra inputs will be retrieved before the run.
    # (Use run_command() to access the extra_inputs argument.)
    list(run_command("{} {{inputs}} {{inputs}} >doubled.dat".format(
                         'type' if on_windows else 'cat'),
                     dataset=ds,
                     inputs=["input.dat"], extra_inputs=["extra-input.dat"]))

    assert_repo_status(ds.path)
    ok_(ds.repo.file_has_content("input.dat"))
    ok_(ds.repo.file_has_content("extra-input.dat"))
    ok_(ds.repo.file_has_content("doubled.dat"))
    with open(op.join(path, "doubled.dat")) as fh:
        content = fh.read()
        assert_in("input", content)
        assert_not_in("extra-input", content)

    # Rerunning the commit will also get the input file.
    ds.repo.drop(["input.dat", "extra-input.dat"], options=["--force"])
    assert_false(ds.repo.file_has_content("input.dat"))
    assert_false(ds.repo.file_has_content("extra-input.dat"))
    ds.rerun()
    ok_(ds.repo.file_has_content("input.dat"))
    ok_(ds.repo.file_has_content("extra-input.dat"))

    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("cd .> dummy", inputs=["not-there"])
        assert_in("Input does not exist: ", cml.out)

    # Test different combinations of globs and explicit files.
    inputs = ["a.dat", "b.dat", "c.txt", "d.txt"]
    create_tree(ds.path, {i: i for i in inputs})

    ds.save()
    ds.repo.copy_to(inputs, remote="origin")
    ds.repo.drop(inputs, options=["--force"])

    test_cases = [(["*.dat"], ["a.dat", "b.dat"]),
                  (["*.dat", "c.txt"], ["a.dat", "b.dat", "c.txt"]),
                  (["*"], inputs)]

    for idx, (inputs_arg, expected_present) in enumerate(test_cases):
        assert_false(any(ds.repo.file_has_content(i) for i in inputs))

        ds.run("cd .> dummy{}".format(idx), inputs=inputs_arg)
        ok_(all(ds.repo.file_has_content(f) for f in expected_present))

        # Globs are stored unexpanded by default.
        assert_in(inputs_arg[0], ds.repo.format_commit("%B"))
        ds.repo.drop(inputs, options=["--force"])

    # --input can be passed a subdirectory.
    create_tree(ds.path, {"subdir": {"a": "subdir a", "b": "subdir b"}})
    ds.save("subdir")
    ds.repo.copy_to(["subdir/a", "subdir/b"], remote="origin")
    ds.repo.drop("subdir", options=["--force"])
    ds.run("cd .> subdir-dummy", inputs=[op.join(ds.path, "subdir")])
    ok_(all(ds.repo.file_has_content(op.join("subdir", f)) for f in ["a", "b"]))

    # Inputs are specified relative to a dataset's subdirectory.
    ds.repo.drop(op.join("subdir", "a"), options=["--force"])
    with chpwd(op.join(path, "subdir")):
        run("cd .> subdir-dummy1", inputs=["a"])
    ok_(ds.repo.file_has_content(op.join("subdir", "a")))

    # --input=. runs "datalad get ."
    ds.run("cd .> dot-dummy", inputs=["."])
    eq_(ds.repo.get_annexed_files(),
        ds.repo.get_annexed_files(with_content_only=True))
    # On rerun, we get all files, even those that weren't in the tree at the
    # time of the run.
    create_tree(ds.path, {"after-dot-run": "after-dot-run content"})
    ds.save()
    ds.repo.copy_to(["after-dot-run"], remote="origin")
    ds.repo.drop(["after-dot-run"], options=["--force"])
    ds.rerun("HEAD^")
    ds.repo.file_has_content("after-dot-run")

    # --output will unlock files that are present.
    ds.repo.get("a.dat")
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(op.join(path, "a.dat")) as fh:
        eq_(fh.read(),
            "a.dat' appended' \n" if on_windows else "a.dat appended\n")

    # --output will remove files that are not present.
    ds.repo.drop(["a.dat", "d.txt"], options=["--force"])
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    if not on_windows:
        # MIH doesn't yet understand how to port this
        with open(op.join(path, "a.dat")) as fh:
            eq_(fh.read(), " appended\n")

    # --input can be combined with --output.
    ds.repo.call_git(["reset", "--hard", "HEAD~2"])
    ds.run("echo ' appended' >>a.dat", inputs=["a.dat"], outputs=["a.dat"])
    if not on_windows:
        # MIH doesn't yet understand how to port this
        with open(op.join(path, "a.dat")) as fh:
            eq_(fh.read(), "a.dat appended\n")

    if not on_windows:  # see datalad#2606
        with swallow_logs(new_level=logging.DEBUG) as cml:
            with swallow_outputs():
                ds.run("echo blah", outputs=["not-there"])
                assert_in("Filtered out non-existing path: ", cml.out)

    ds.create('sub')
    ds.run("echo sub_orig >sub/subfile")
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])
    ds.drop("sub/subfile", check=False)
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])

    # --input/--output globs can be stored in expanded form.
    ds.run("cd .> expand-dummy", inputs=["a.*"], outputs=["b.*"], expand="both")
    assert_in("a.dat", ds.repo.format_commit("%B"))
    assert_in("b.dat", ds.repo.format_commit("%B"))
    res = ds.rerun(report=True, return_type='item-or-list')
    eq_(res["run_info"]['inputs'], ["a.dat"])
    eq_(res["run_info"]['outputs'], ["b.dat"])

    # We install subdatasets to fully resolve globs.
    ds.uninstall("s0")
    assert_false(Dataset(op.join(path, "s0")).is_installed())
    ds.run("echo {inputs} >globbed-subds", inputs=["s0/s1_*/s2/*.dat"])
    ok_file_has_content(
        op.join(ds.path, "globbed-subds"),
        "'s0\\s1_0\\s2\\a.dat' 's0\\s1_1\\s2\\c.dat'" if on_windows
        else "s0/s1_0/s2/a.dat s0/s1_1/s2/c.dat",
        strip=True)

    ds_ss = Dataset(op.join(path, "s0", "ss"))
    assert_false(ds_ss.is_installed())
    ds.run("echo blah >{outputs}", outputs=["s0/ss/out"])
    ok_(ds_ss.is_installed())
    ok_file_has_content(op.join(ds.path, "s0", "ss", "out"),
                        "blah", strip=True)

def test_clean_subds_removal(path):
    ds = Dataset(path).create()
    subds1 = ds.create('one')
    subds2 = ds.create('two')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['one', 'two'])
    assert_repo_status(ds.path)
    # now kill one
    res = ds.remove('one', result_xfm=None)
    # subds1 got uninstalled, and ds got the removal of subds1 saved
    assert_result_count(res, 1, path=subds1.path, action='uninstall', status='ok')
    assert_result_count(res, 1, path=subds1.path, action='remove', status='ok')
    assert_result_count(res, 1, path=ds.path, action='save', status='ok')
    ok_(not subds1.is_installed())
    assert_repo_status(ds.path)
    # two must remain
    eq_(ds.subdatasets(result_xfm='relpaths'), ['two'])
    # one is gone
    assert not exists(subds1.path)
    # and now again, but this time remove something that is not installed
    ds.create('three')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    ds.uninstall('two')
    assert_repo_status(ds.path)
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    ok_(not subds2.is_installed())
    assert exists(subds2.path)
    res = ds.remove('two', result_xfm='datasets')
    assert_repo_status(ds.path)
    # subds2 was already uninstalled, now ds got the removal of subds2 saved
    assert not exists(subds2.path)
    eq_(ds.subdatasets(result_xfm='relpaths'), ['three'])
    eq_(res, [subds2, ds])

def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(
        template="openfmri",
        dataset_name='dataladtest-ds666',
        path=outd,
        data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content;
    # to mitigate that, let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes(
        [('metadata/objects/**', {'annex.largefiles': '(nothing)'})],
        dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have a separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have a separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with the 1.0.0 tag having 1 parent in incoming,
    # and 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #     see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have the desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on the master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names, which contain a
    # checksum-like component ...
    # TODO: Check how those names are constructed and maybe at least count the
    # number of created object files in addition to this comparison
    eq_(
        set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in the 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')
    # Ben: metadata object files may differ in their names, which contain a
    # checksum-like component ...
    # TODO: Check how those names are constructed and maybe at least count the
    # number of created object files in addition to this comparison
    eq_(
        set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(
        total_stats,
        ActivityStats(
            files=8, skipped=5, downloaded=1, renamed=1, urls=6,
            add_annex=2,  # add_git=1, # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if the pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely in
            # incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)

    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)

    commits_hexsha_removed = {
        b: list(_get_branch_commits(repo, b)) for b in branches
    }
    # our 'statuses' database should have recorded the change, thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with a non-persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as well
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))

def test_get_subpaths():
    for fname, expect in [("no/dbl/slash", ("no/dbl/slash", [])),
                          ("p1//n", ("p1/n", ["p1"])),
                          ("p1//p2/p3//n", ("p1/p2/p3/n", ["p1", "p1/p2/p3"])),
                          ("//n", ("/n", [""])),
                          ("n//", ("n/", ["n"]))]:
        eq_(au.get_subpaths(fname), expect)

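# The expectations above imply a helper that treats '//' as a subdataset
# boundary marker: it returns the path with the markers collapsed plus the
# list of prefixes at which subdatasets would sit. The following is a minimal
# illustrative sketch of that behavior (a hypothetical re-implementation, not
# the actual datalad addurls helper), included only to make the test cases
# above easier to read.
def _get_subpaths_sketch(fname):
    spaths = []
    # each '//' marks the end of one (possibly nested) subdataset prefix
    while "//" in fname:
        spaths.append(fname[:fname.index("//")])
        fname = fname.replace("//", "/", 1)
    return fname, spaths

# e.g. _get_subpaths_sketch("p1//p2/p3//n") == ("p1/p2/p3/n", ["p1", "p1/p2/p3"])
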
def test_path_diff(_path, linkpath):
    # do the setup on the real path, not the symlink, so that its
    # bugs do not affect this test of status()
    ds = get_deeply_nested_structure(str(_path))
    if has_symlink_capability():
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(_path, target_is_directory=True)
        path = linkpath
    else:
        path = _path

    ds = Dataset(path)
    if not on_windows:
        # TODO test should also be has_symlink_capability(), but
        # something in the repo base class is not behaving yet
        # check the premise of this test
        assert ds.pathobj != ds.repo.pathobj

    plain_recursive = ds.diff(recursive=True, annex='all')
    # check integrity of individual reports with a focus on how symlinks
    # are reported
    for res in plain_recursive:
        # anything that is an "intended" symlink should be reported
        # as such. In contrast, anything that is a symlink for mere
        # technical reasons (annex using it for something in some mode)
        # should be reported as the thing it is representing (i.e.
        # a file)
        if 'link2' in text_type(res['path']):
            assert res['type'] == 'symlink', res
        else:
            assert res['type'] != 'symlink', res
        # every item must report its parent dataset
        assert_in('parentds', res)

    # bunch of smoke tests
    # query of '.' is same as no path
    eq_(plain_recursive, ds.diff(path='.', recursive=True, annex='all'))
    # duplicate paths do not change things
    eq_(plain_recursive, ds.diff(path=['.', '.'], recursive=True, annex='all'))
    # neither do nested paths
    eq_(plain_recursive,
        ds.diff(path=['.', 'subds_modified'], recursive=True, annex='all'))
    # when invoked in a subdir of a dataset it still reports on the full thing
    # just like `git status`, as long as there are no paths specified
    with chpwd(op.join(path, 'directory_untracked')):
        plain_recursive = diff(recursive=True, annex='all')
    # should be able to take absolute paths and yield the same
    # output
    eq_(plain_recursive, ds.diff(path=ds.path, recursive=True, annex='all'))

    # query for a deeply nested path from the top, should just work with a
    # variety of approaches
    rpath = op.join('subds_modified', 'subds_lvl1_modified',
                    u'{}_directory_untracked'.format(OBSCURE_FILENAME))
    apathobj = ds.pathobj / rpath
    apath = text_type(apathobj)
    for p in (rpath, apath, None):
        if p is None:
            # change into the realpath of the dataset and
            # query with an explicit path
            with chpwd(ds.path):
                res = ds.diff(path=op.join('.', rpath),
                              recursive=True, annex='all')
        else:
            res = ds.diff(path=p, recursive=True, annex='all')
        assert_result_count(
            res,
            1,
            state='untracked',
            type='directory',
            refds=ds.path,
            # path always comes out a full path inside the queried dataset
            path=apath,
        )

    assert_result_count(ds.diff(recursive=True), 1, path=apath)
    # limiting recursion will exclude this particular path
    assert_result_count(ds.diff(recursive=True, recursion_limit=1), 0, path=apath)
    # negative limit is unlimited limit
    eq_(ds.diff(recursive=True, recursion_limit=-1), ds.diff(recursive=True))

def func2(x):
    assert x == 1
    eq_(ui.yesno("title"), True)
    eq_(ui.question("title2"), "maybe so")
    assert_raises(AssertionError, ui.question, "asking more than we know")
    return x * 2

def test_a_href_match_basic():
    m = a_href_match('.*')
    mg = m(dict(response=sample1.response))
    ok_(inspect.isgenerator(mg))
    hits = list(mg)
    eq_(len(hits), 3)
    eq_([u['url_text'] for u in hits], sample1.a_texts)
    eq_([u['url_href'] for u in hits], sample1.a_url_hrefs)
    # nothing done to url
    eq_([u['url'] for u in hits], sample1.a_url_hrefs)

    # if we do provide the original url where it comes from -- result urls should be full
    mg = m(dict(response=sample1.response, url="http://w.example.com:888/d/"))
    ok_(inspect.isgenerator(mg))
    hits = list(mg)
    eq_(len(hits), 3)
    eq_([u['url_text'] for u in hits], sample1.a_texts)
    eq_([u['url_href'] for u in hits], sample1.a_url_hrefs)
    eq_([u['url'] for u in hits],
        ['http://w.example.com:888/',
         'http://w.example.com:888/d/buga/duga/du',
         'http://example.com'])

def test_repo_diff(path, norepo):
    ds = Dataset(path).create()
    assert_repo_status(ds.path)
    assert_raises(ValueError, ds.repo.diff, fr='WTF', to='MIKE')
    # no diff
    eq_(ds.repo.diff('HEAD', None), {})
    # bogus path makes no difference
    eq_(ds.repo.diff('HEAD', None, paths=['THIS']), {})
    # let's introduce a known change
    create_tree(ds.path, {'new': 'empty'})
    ds.save(to_git=True)
    assert_repo_status(ds.path)
    eq_(ds.repo.diff(fr='HEAD~1', to='HEAD'),
        {ut.Path(ds.repo.pathobj / 'new'): {
            'state': 'added',
            'type': 'file',
            'bytesize': 5,
            'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6'}})
    # modify known file
    create_tree(ds.path, {'new': 'notempty'})
    eq_(ds.repo.diff(fr='HEAD', to=None),
        {ut.Path(ds.repo.pathobj / 'new'): {
            'state': 'modified',
            'type': 'file',
            # the beast is modified, but no change in shasum -> not staged
            'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6',
            'prev_gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6'}})
    # per-path query gives the same result
    eq_(ds.repo.diff(fr='HEAD', to=None),
        ds.repo.diff(fr='HEAD', to=None, paths=['new']))
    # also giving a directory as a constraint does the same
    eq_(ds.repo.diff(fr='HEAD', to=None),
        ds.repo.diff(fr='HEAD', to=None, paths=['.']))
    # but if we give another path, it doesn't show up
    eq_(ds.repo.diff(fr='HEAD', to=None, paths=['other']), {})
    # make clean
    ds.save()
    assert_repo_status(ds.path)
    # untracked stuff
    create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}})
    # default is to report all files
    eq_(ds.repo.diff(fr='HEAD', to=None),
        {ut.Path(ds.repo.pathobj / 'deep' / 'down'): {
            'state': 'untracked', 'type': 'file'},
         ut.Path(ds.repo.pathobj / 'deep' / 'down2'): {
            'state': 'untracked', 'type': 'file'}})
    # but can be made more compact
    eq_(ds.repo.diff(fr='HEAD', to=None, untracked='normal'),
        {ut.Path(ds.repo.pathobj / 'deep'): {
            'state': 'untracked', 'type': 'directory'}})
    # again, an unmatched path constraint will give an empty report
    eq_(ds.repo.diff(fr='HEAD', to=None, paths=['other']), {})
    # perfect match and anything underneath will do
    eq_(ds.repo.diff(fr='HEAD', to=None, paths=['deep']),
        {ut.Path(ds.repo.pathobj / 'deep' / 'down'): {
            'state': 'untracked', 'type': 'file'},
         ut.Path(ds.repo.pathobj / 'deep' / 'down2'): {
            'state': 'untracked', 'type': 'file'}})

def test_update_strategy(path):
    base = Dataset(op.join(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\n'
            'metadata/objects/** annex.largefiles=anything\n')
    sub = base.create('sub', force=True)
    subsub = sub.create(op.join('subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we start clean
    for ds in base, sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the base dataset only, nothing below changes
    base.meta_aggregate()
    eq_(len(_get_contained_objs(base)), 2)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    # aggregate the entire tree, but by default this only updates
    # the top-level dataset with all objects; none of the leaf
    # or intermediate datasets gets touched
    base.meta_aggregate(recursive=True)
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_referenced_objs(base)), 6)
    for ds in sub, subsub:
        eq_(len(_get_contained_objs(ds)), 0)
    res = base.meta_dump(reporton='aggregates', recursive=True)
    assert_result_count(res, 3)
    # it is impossible to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status('impossible',
                      ds.meta_dump(reporton='aggregates', on_failure='ignore'))
    # get the full metadata report
    target_meta = _kill_time(base.meta_dump())

    # now redo the full aggregation, this time updating all
    # (intermediate) datasets
    base.meta_aggregate(recursive=True, into='all')
    eq_(len(_get_contained_objs(base)), 6)
    eq_(len(_get_contained_objs(sub)), 4)
    eq_(len(_get_contained_objs(subsub)), 2)
    # it is now OK to query an intermediate or leaf dataset
    # for metadata
    for ds in sub, subsub:
        assert_status('ok',
                      ds.meta_dump(reporton='aggregates', on_failure='ignore'))

    # all of that has no impact on the reported metadata
    # minus the change in the refcommits
    for i in zip(target_meta, _kill_time(base.meta_dump())):
        assert_dict_equal(i[0], i[1])

def test_setup():
    # just verify that we monkey-patched consts correctly
    from datalad.consts import DATASETS_TOPURL
    eq_(DATASETS_TOPURL, 'http://datasets-tests.datalad.org/')
    from datalad.tests.utils import get_datasets_topdir
    eq_(get_datasets_topdir(), 'datasets-tests.datalad.org')

def check_push(annex, src_path, dst_path):
    # prepare src
    src = Dataset(src_path).create(annex=annex)
    src_repo = src.repo
    # push should not add branches to the local dataset
    orig_branches = src_repo.get_branches()
    assert_not_in('synced/' + DEFAULT_BRANCH, orig_branches)

    res = src.push(on_failure='ignore')
    assert_result_count(res, 1)
    assert_in_results(
        res, status='impossible',
        message='No push target given, and none could be auto-detected, '
                'please specify via --to')
    eq_(orig_branches, src_repo.get_branches())
    # target sibling
    target = mk_push_target(src, 'target', dst_path, annex=annex)
    eq_(orig_branches, src_repo.get_branches())

    res = src.push(to="target")
    eq_(orig_branches, src_repo.get_branches())
    assert_result_count(res, 2 if annex else 1)
    assert_in_results(
        res,
        action='publish', status='ok', target='target',
        refspec=DEFAULT_REFSPEC,
        operations=['new-branch'])

    assert_repo_status(src_repo, annex=annex)
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # configure a default merge/upstream target
    src.config.set('branch.{}.remote'.format(DEFAULT_BRANCH),
                   'target', where='local')
    src.config.set('branch.{}.merge'.format(DEFAULT_BRANCH),
                   DEFAULT_BRANCH, where='local')

    # don't fail when doing it again; no explicit target specification
    # needed anymore
    res = src.push()
    eq_(orig_branches, src_repo.get_branches())
    # and nothing is pushed
    assert_status('notneeded', res)

    assert_repo_status(src_repo, annex=annex)
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # some modification:
    (src.pathobj / 'test_mod_file').write_text("Some additional stuff.")
    src.save(to_git=True, message="Modified.")
    (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.")
    src.save(to_git=not annex, message="Modified again.")
    assert_repo_status(src_repo, annex=annex)

    # we could say since='HEAD~2' to make things fast, or we are lazy
    # and say since='^' to indicate the state of the tracking remote,
    # which is the same, because we made two commits since the last push.
    res = src.push(to='target', since="^", jobs=2)
    assert_in_results(
        res,
        action='publish', status='ok', target='target',
        refspec=DEFAULT_REFSPEC,
        # we get to see what happened
        operations=['fast-forward'])
    if annex:
        # we got to see the copy result for the annexed files
        assert_in_results(
            res,
            action='copy', status='ok',
            path=str(src.pathobj / 'test_mod_annex_file'))
        # we published, so we can drop and reobtain
        ok_(src_repo.file_has_content('test_mod_annex_file'))
        src_repo.drop('test_mod_annex_file')
        ok_(not src_repo.file_has_content('test_mod_annex_file'))
        src_repo.get('test_mod_annex_file')
        ok_(src_repo.file_has_content('test_mod_annex_file'))
        ok_file_has_content(
            src_repo.pathobj / 'test_mod_annex_file',
            'Heavy stuff.')

    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    if not (annex and src_repo.is_managed_branch()):
        # the following doesn't make sense in managed branches, because
        # a commit that could be amended is no longer the last commit
        # of a branch after a sync has happened (which did happen
        # during the last push above)

        # amend and change the commit msg in order to test for force push:
        src_repo.commit("amended", options=['--amend'])
        # push should be rejected (non-fast-forward):
        res = src.push(to='target', since='HEAD~2', on_failure='ignore')
        # fails before even touching the annex branch
        assert_in_results(
            res,
            action='publish', status='error', target='target',
            refspec=DEFAULT_REFSPEC,
            operations=['rejected', 'error'])
        # push with force=True works:
        res = src.push(to='target', since='HEAD~2', force='gitpush')
        assert_in_results(
            res,
            action='publish', status='ok', target='target',
            refspec=DEFAULT_REFSPEC,
            operations=['forced-update'])
        eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
            list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # we do not have more branches than we had in the beginning,
    # in particular no 'synced/<default branch>'
    eq_(orig_branches, src_repo.get_branches())

def test_push_recursive(
        origin_path, src_path, dst_top, dst_sub, dst_subnoannex, dst_subsub):
    # dataset with two submodules and one subsubmodule
    origin = Dataset(origin_path).create()
    origin_subm1 = origin.create('sub m')
    origin_subm1.create('subsub m')
    origin.create('subm noannex', annex=False)
    origin.save()
    assert_repo_status(origin.path)
    # prepare src as a fresh clone with all subdatasets checked out recursively
    # running on a clone should make the test scenario more different from
    # test_push(), even for the pieces that should be identical
    top = Clone.__call__(source=origin.path, path=src_path)
    subs = top.get('.', recursive=True, get_data=False, result_xfm='datasets')
    # order for '.' should not be relied upon, so sort by path
    sub, subsub, subnoannex = sorted(subs, key=lambda ds: ds.path)

    target_top = mk_push_target(top, 'target', dst_top, annex=True)
    # subdatasets have no remote yet, so recursive publishing should fail:
    res = top.push(to="target", recursive=True, on_failure='ignore')
    assert_in_results(
        res, path=top.path, type='dataset',
        refspec=DEFAULT_REFSPEC,
        operations=['new-branch'], action='publish', status='ok',
        target='target')
    for d in (sub, subsub, subnoannex):
        assert_in_results(
            res, status='error', type='dataset', path=d.path,
            message=("Unknown target sibling '%s'.", 'target'))
    # now fix that and set up targets for the submodules
    target_sub = mk_push_target(sub, 'target', dst_sub, annex=True)
    target_subnoannex = mk_push_target(
        subnoannex, 'target', dst_subnoannex, annex=False)
    target_subsub = mk_push_target(subsub, 'target', dst_subsub, annex=True)

    # and the same push call as above
    res = top.push(to="target", recursive=True)
    # topds skipped
    assert_in_results(
        res, path=top.path, type='dataset', action='publish',
        status='notneeded', target='target')
    # the rest pushed
    for d in (sub, subsub, subnoannex):
        assert_in_results(
            res, status='ok', type='dataset', path=d.path,
            refspec=DEFAULT_REFSPEC)
    # all corresponding branches match across all datasets
    for s, d in zip((top, sub, subnoannex, subsub),
                    (target_top, target_sub, target_subnoannex, target_subsub)):
        eq_(list(s.repo.get_branch_commits_(DEFAULT_BRANCH)),
            list(d.get_branch_commits_(DEFAULT_BRANCH)))
        if s != subnoannex:
            eq_(list(s.repo.get_branch_commits_("git-annex")),
                list(d.get_branch_commits_("git-annex")))

    # a rerun should not result in further pushes of the default branch
    res = top.push(to="target", recursive=True)
    assert_not_in_results(res, status='ok', refspec=DEFAULT_REFSPEC)
    assert_in_results(res, status='notneeded', refspec=DEFAULT_REFSPEC)

    # now annex a file in subsub
    test_copy_file = subsub.pathobj / 'test_mod_annex_file'
    test_copy_file.write_text("Heavy stuff.")
    # save all the way up
    assert_status(('ok', 'notneeded'),
                  top.save(message='subsub got something', recursive=True))
    assert_repo_status(top.path)

    # publish straight up, should be smart by default
    res = top.push(to="target", recursive=True)
    # we see 3 out of 4 datasets pushed (sub noannex was left unchanged)
    for d in (top, sub, subsub):
        assert_in_results(
            res, status='ok', type='dataset', path=d.path,
            refspec=DEFAULT_REFSPEC)
    # file content copied too
    assert_in_results(res, action='copy', status='ok', path=str(test_copy_file))
    # verify it is accessible, drop and bring back
    assert_status('ok', top.drop(str(test_copy_file)))
    ok_(not subsub.repo.file_has_content('test_mod_annex_file'))
    top.get(test_copy_file)
    ok_file_has_content(test_copy_file, 'Heavy stuff.')

    # make two modifications
    (sub.pathobj / 'test_mod_annex_file').write_text('annex')
    (subnoannex.pathobj / 'test_mod_file').write_text('git')
    # save separately
    top.save(sub.pathobj, message='annexadd', recursive=True)
    top.save(subnoannex.pathobj, message='gitadd', recursive=True)

    # now only publish the latter one
    res = top.push(to="target", since=DEFAULT_BRANCH + '~1', recursive=True)
    # nothing copied, no reports on the other modification
    assert_not_in_results(res, action='copy')
    assert_not_in_results(res, path=sub.path)
    for d in (top, subnoannex):
        assert_in_results(
            res, status='ok', type='dataset', path=d.path,
            refspec=DEFAULT_REFSPEC)

    # an unconditional push should now pick up the remaining changes
    res = top.push(to="target", recursive=True)
    assert_in_results(
        res, action='copy', status='ok',
        path=str(sub.pathobj / 'test_mod_annex_file'))
    assert_in_results(
        res, status='ok', type='dataset', path=sub.path,
        refspec=DEFAULT_REFSPEC)
    for d in (top, subnoannex, subsub):
        assert_in_results(
            res, status='notneeded', type='dataset', path=d.path,
            refspec=DEFAULT_REFSPEC)

    # if the noannex target gets some annex, we still should not fail to push
    target_subnoannex.call_git(['annex', 'init'])
    # just to ensure that we do need something to push
    (subnoannex.pathobj / "newfile").write_text("content")
    subnoannex.save()
    res = subnoannex.push(to="target")
    assert_in_results(res, status='ok', type='dataset')

def test_formatter_lower_case():
    fmt = au.Formatter({0: "key"})
    eq_(fmt.format("{key!l}", {"key": "UP"}), "up")
    eq_(fmt.format("{0!l}", {"key": "UP"}), "up")
    eq_(fmt.format("{other!s}", {}, other=[1, 2]), "[1, 2]")

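# The assertions above exercise a Formatter with a custom "!l" (lower-case)
# conversion, a constructor that maps positional field indices to keys of the
# value mapping, and kwargs as a fallback source of values. The sketch below
# is a hypothetical minimal re-implementation of that behavior for
# illustration only; it is not the actual datalad addurls Formatter, and its
# constructor semantics are an assumption inferred from the test.
import string


class LowerCaseFormatterSketch(string.Formatter):
    def __init__(self, idx_to_name=None):
        # maps positional indices (e.g. 0 in "{0!l}") to mapping keys
        self.idx_to_name = idx_to_name or {}

    def get_value(self, key, args, kwargs):
        # the first positional argument is the mapping with field values
        data = args[0]
        if isinstance(key, int):
            key = self.idx_to_name[key]
        if key in data:
            return data[key]
        # fall back to kwargs (and standard behavior) for anything else
        return super().get_value(key, args, kwargs)

    def convert_field(self, value, conversion):
        if conversion == 'l':
            return str(value).lower()
        return super().convert_field(value, conversion)

# Usage mirroring the test:
#   fmt = LowerCaseFormatterSketch({0: "key"})
#   fmt.format("{key!l}", {"key": "UP"})  -> "up"
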
def f(arg, kwarg=None):
    eq_(arg, 1)
    eq_(kwarg, 2)
    eq_(getpwd(), d)