Example #1
def test_dirty(path):
    for mode in _dirty_modes:
        # does nothing without a dataset
        handle_dirty_dataset(None, mode)
    # placeholder, but not yet created
    ds = Dataset(path)
    # unknown mode
    assert_raises(ValueError, handle_dirty_dataset, ds, 'MADEUP')
    # not yet created is very dirty
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'fail')
    handle_dirty_dataset(ds, 'ignore')
    assert_raises(RuntimeError, handle_dirty_dataset, ds, 'save-before')
    # should yield a clean repo
    ds.create()
    orig_state = ds.repo.get_hexsha()
    _check_all_clean(ds, orig_state)
    # tainted: untracked
    with open(opj(ds.path, 'something'), 'w') as f:
        f.write('some')
    # we don't want to auto-add untracked files by saving (anymore)
    assert_raises(AssertionError, _check_auto_save, ds, orig_state)
    # tainted: staged
    ds.repo.add('something', git=True)
    orig_state = _check_auto_save(ds, orig_state)
    # tainted: submodule
    # not added to super on purpose!
    subds = ds.create('subds')
    _check_all_clean(subds, subds.repo.get_hexsha())
    ok_clean_git(ds.path)
    # subdataset must be added as a submodule!
    assert_equal(ds.subdatasets(result_xfm='relpaths'), ['subds'])
Example #2
def test_diff_nonexistent_ref_unicode(path):
    ds = Dataset(path).create()
    assert_result_count(
        ds.diff(fr="HEAD", to=u"β", on_failure="ignore"),
        1,
        path=ds.path,
        status="impossible")
Example #3
def test_unlock_raises(path, path2, path3):

    # make sure, we are not within a dataset:
    _cwd = getpwd()
    chpwd(path)

    # no dataset and no path:
    assert_raises(InsufficientArgumentsError,
                  unlock, dataset=None, path=None)
    # no dataset and path not within a dataset:
    with swallow_logs(new_level=logging.WARNING) as cml:
        unlock(dataset=None, path=path2)
        assert_in("ignored paths that do not belong to any dataset: ['{0}'".format(path2),
                  cml.out)

    create(path=path, no_annex=True)
    ds = Dataset(path)
    # no complaints
    ds.unlock()

    # make it annex, but call unlock with invalid path:
    AnnexRepo(path, create=True)
    with swallow_logs(new_level=logging.WARNING) as cml:
        ds.unlock(path="notexistent.txt")
        assert_in("ignored non-existing paths", cml.out)

    chpwd(_cwd)
Example #4
def test_get_metadata(path):

    ds = Dataset(path).create(force=True)
    ds.save()
    meta = MetadataExtractor(ds, [])._get_dataset_metadata()
    assert_equal(
        dumps(meta, sort_keys=True, indent=2),
        """\
{
  "citation": "Cool (2016)",
  "conformsto": "http://docs.datalad.org/metadata.html#v0-1",
  "description": "A text with arbitrary length and content that can span multiple\\nparagraphs (this is a new one)",
  "fundedby": "BMBFGQ1411, NSF 1429999",
  "homepage": "http://studyforrest.org",
  "issuetracker": "https://github.com/psychoinformatics-de/studyforrest-data-phase2/issues",
  "license": [
    "CC0",
    "The person who associated a work with this deed has dedicated the work to the public domain by waiving all of his or her rights to the work worldwide under copyright law, including all related and neighboring rights, to the extent allowed by law.\\nYou can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission."
  ],
  "maintainer": [
    "Mike One <*****@*****.**>",
    "Anna Two <*****@*****.**>"
  ],
  "name": "studyforrest_phase2",
  "sameas": "http://dx.doi.org/10.5281/zenodo.48421",
  "shortdescription": "Basic summary",
  "version": "1.0.0-rc3"
}""")
Example #5
def test_runnin_on_empty(path):
    # empty repo
    repo = AnnexRepo(path, create=True)
    # just wrap with a dataset
    ds = Dataset(path)
    # and run status ... should be good and do nothing
    eq_([], ds.status())
Example #6
def test_gh2043p1(path):
    # this test documents the interim agreement on what should happen
    # in the case documented in gh-2043
    ds = Dataset(path).create(force=True)
    ds.save('1')
    assert_repo_status(ds.path, untracked=['2', '3'])
    ds.unlock('1')
    assert_repo_status(
        ds.path,
        # on windows we are in an unlocked branch by default, hence
        # we would see no change
        modified=[] if on_windows else ['1'],
        untracked=['2', '3'])
    # save(.) should recommit unlocked file, and not touch anything else
    # this tests the second issue in #2043
    with chpwd(path):
        # only save modified bits
        save(path='.', updated=True)
    # state of the file (unlocked/locked) is committed as well, and the
    # test doesn't lock the file again
    assert_repo_status(ds.path, untracked=['2', '3'])
    with chpwd(path):
        # but when a path is given, anything that matches this path
        # untracked or not is added/saved
        save(path='.')
    # state of the file (unlocked/locked) is committed as well, and the
    # test doesn't lock the file again
    assert_repo_status(ds.path)
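
The behaviour pinned down above -- save(updated=True) only recommits already-known content, while a path-constrained save(path='.') also picks up untracked files -- can be condensed into a short sketch (dataset path and file names are hypothetical):

from datalad.api import Dataset

ds = Dataset('/tmp/save-demo').create()  # hypothetical path
(ds.pathobj / 'tracked').write_text(u'v1')
ds.save('tracked', to_git=True)  # keep it in git so we can simply rewrite it
(ds.pathobj / 'tracked').write_text(u'v2')     # modify known content
(ds.pathobj / 'untracked').write_text(u'new')  # leave this one untracked
ds.save(updated=True)  # commits the modification, leaves 'untracked' alone
ds.save(path='.')      # now also adds and commits 'untracked'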
Example #7
def test_symlinked_relpath(path):
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    os.makedirs(op.join(path, "origin"))
    dspath = op.join(path, "linked")
    os.symlink('origin', dspath)
    ds = Dataset(dspath).create()
    create_tree(dspath, {
        "mike1": 'mike1',  # will be added from topdir
        "later": "later",  # later from within subdir
        "d": {
            "mike2": 'mike2', # to be added within subdir
        }
    })

    # in the root of ds
    with chpwd(dspath):
        ds.repo.add("mike1", git=True)
        ds.save(message="committing", path="./mike1")

    # Let's also do it with a subdirectory as CWD, and check that relative
    # paths given to a plain command (not a dataset method) are treated as
    # relative to CWD
    with chpwd(op.join(dspath, 'd')):
        save(dataset=ds.path,
             message="committing",
             path="mike2")

        later = op.join(op.pardir, "later")
        ds.repo.add(later, git=True)
        save(dataset=ds.path, message="committing", path=later)

    assert_repo_status(dspath)
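
What this test exercises is the difference between dataset-bound methods (paths resolved against the dataset) and plain module-level commands (paths resolved against the current working directory). A small sketch of the same idea, with a hypothetical dataset path:

import os.path as op
from datalad.api import Dataset, save
from datalad.utils import chpwd

ds = Dataset('/tmp/relpath-demo').create()  # hypothetical path
(ds.pathobj / 'd').mkdir()
(ds.pathobj / 'd' / 'file').write_text(u'content')
with chpwd(op.join(ds.path, 'd')):
    # plain command: 'file' is resolved against CWD, i.e. <ds>/d/file
    save(dataset=ds.path, message="committing", path="file")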
Example #8
def test_unlock_raises(path, path2, path3):

    # make sure, we are not within a dataset:
    _cwd = getpwd()
    chpwd(path)

    # no dataset and no path:
    assert_raises(InsufficientArgumentsError,
                  unlock, dataset=None, path=None)
    # no dataset and path not within a dataset:
    res = unlock(dataset=None, path=path2, result_xfm=None,
                 on_failure='ignore', return_type='item-or-list')
    eq_(res['message'], "path not associated with any dataset")
    eq_(res['path'], path2)

    create(path=path, no_annex=True)
    ds = Dataset(path)
    # no complaints
    ds.unlock()

    # make it annex, but call unlock with invalid path:
    AnnexRepo(path, create=True)
    res = ds.unlock(path="notexistent.txt", result_xfm=None,
                    on_failure='ignore', return_type='item-or-list')
    eq_(res['message'], "path does not exist")

    chpwd(_cwd)
Example #9
def test_subdataset_save(path):
    parent = Dataset(path).create()
    sub = parent.create('sub')
    assert_repo_status(parent.path)
    create_tree(parent.path, {
        "untracked": 'ignore',
        'sub': {
            "new": "wanted"}})
    sub.save('new')
    # defined state: one untracked, modified (but clean in itself) subdataset
    assert_repo_status(sub.path)
    assert_repo_status(parent.path, untracked=['untracked'], modified=['sub'])

    # `save sub` does not save the parent!!
    with chpwd(parent.path):
        assert_status('notneeded', save(dataset=sub.path))
    assert_repo_status(parent.path, untracked=['untracked'], modified=['sub'])
    # `save -u .` saves the state change in the subdataset,
    # but leaves any untracked content alone
    with chpwd(parent.path):
        assert_status('ok', parent.save(updated=True))
    assert_repo_status(parent.path, untracked=['untracked'])

    # get back to the original modified state and check that -S behaves in
    # exactly the same way
    create_tree(parent.path, {
        'sub': {
            "new2": "wanted2"}})
    sub.save('new2')
    assert_repo_status(parent.path, untracked=['untracked'], modified=['sub'])
Example #10
def test_run_from_subds(path):
    if 'APPVEYOR' in os.environ:
        raise SkipTest('test causes appveyor (only) to crash, reason unknown')

    subds = Dataset(path).create().create("sub")
    subds.run("cd .> foo")
    assert_repo_status(subds.path)
Example #11
def test_rerun_just_one_commit(path):
    ds = Dataset(path).create()

    # Check out an orphan branch so that we can test the "one commit
    # in a repo" case.
    ds.repo.checkout("orph", options=["--orphan"])
    ds.repo.repo.git.reset("--hard")
    ds.repo.config.reload()

    ds.run('echo static-content > static')
    assert_result_count(ds.repo.repo.git.rev_list("HEAD").split(), 1)

    # Rerunning with just one commit doesn't raise an error ...
    ds.rerun()
    # ... but we're still at one commit because the content didn't
    # change.
    assert_result_count(ds.repo.repo.git.rev_list("HEAD").split(), 1)

    # We abort rather than trying to do anything when --onto='' and
    # --since='' are given together and the first commit contains a
    # run command.
    ds.repo.commit(msg="empty", options=["--allow-empty"])
    assert_raises(IncompleteResultsError, ds.rerun, since="", onto="")

    # --script propagates the error.
    with swallow_outputs():
        assert_raises(IncompleteResultsError,
                      ds.rerun, since="", onto="", script="-")
    # --dry-run propagates the error.
    assert_raises(IncompleteResultsError,
                  ds.rerun, since="", onto="",
                  report=True, return_type="list")
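
For orientation, the run/rerun round-trip this test builds on, reduced to its simplest form (a sketch with a hypothetical path, not part of the test suite):

from datalad.api import Dataset

ds = Dataset('/tmp/rerun-demo').create()  # hypothetical path
ds.run('echo static-content > static')  # command and its outputs get committed
ds.rerun()  # re-executes the last recorded command; identical output
            # means no new commit is created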
Example #12
def test_add_recursive(path):
    # make simple hierarchy
    parent = Dataset(path).create()
    assert_repo_status(parent.path)
    sub1 = parent.create(op.join('down', 'sub1'))
    assert_repo_status(parent.path)
    sub2 = parent.create('sub2')
    # the next one makes the parent dirty
    subsub = sub2.create('subsub')
    assert_repo_status(parent.path, modified=['sub2'])
    res = parent.save()
    assert_repo_status(parent.path)

    # now add content deep in the hierarchy
    create_tree(subsub.path, {'new': 'empty'})
    assert_repo_status(parent.path, modified=['sub2'])

    # recursive add should not even touch sub1, because
    # it knows that it is clean
    res = parent.save(recursive=True)
    # the key action is done
    assert_result_count(
        res, 1, path=op.join(subsub.path, 'new'), action='add', status='ok')
    # saved all the way up
    assert_result_count(res, 3, action='save', status='ok')
    assert_repo_status(parent.path)
Example #13
def check_renamed_file(recursive, no_annex, path):
    ds = Dataset(path).create(no_annex=no_annex)
    create_tree(path, {'old': ''})
    ds.add('old')
    ds.repo._git_custom_command(['old', 'new'], ['git', 'mv'])
    ds._save(recursive=recursive)
    ok_clean_git(path)
Example #14
def test_status_basics(path, linkpath, otherdir):
    if not on_windows:
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(path, target_is_directory=True)
        path = linkpath

    with chpwd(path):
        assert_raises(NoDatasetArgumentFound, status)
    ds = Dataset(path).create()
    # outcome identical between ds= and auto-discovery
    with chpwd(path):
        assert_raises(IncompleteResultsError, status, path=otherdir)
        stat = status()
    eq_(stat, ds.status())
    assert_status('ok', stat)
    # we have a bunch of reports (be vague to be robust to future changes)
    assert len(stat) > 2
    # check the composition
    for s in stat:
        eq_(s['status'], 'ok')
        eq_(s['action'], 'status')
        eq_(s['state'], 'clean')
        eq_(s['type'], 'file')
        assert_in('gitshasum', s)
        assert_in('bytesize', s)
        eq_(s['refds'], ds.path)
Example #15
def test_add_mimetypes(path):
    ds = Dataset(path).create(force=True)
    ds.repo.add('.gitattributes')
    ds.repo.commit('added attributes to git explicitly')
    # now test that those files will go into git/annex correspondingly
    # WINDOWS FAILURE NEXT
    __not_tested__ = ds.save(['file.txt', 'empty'])
    assert_repo_status(path, untracked=['file2.txt'])
    # But we should be able to force adding a file to the annex when desired
    ds.save('file2.txt', to_git=False)
    # check annex file status
    annexinfo = ds.repo.get_content_annexinfo()
    for path, in_annex in (
           # Empty one is considered to be application/octet-stream,
           # i.e. non-text
           ('empty', True),
           ('file.txt', False),
           ('file2.txt', True)):
        # low-level API report -> repo path reference, no ds path
        p = ds.repo.pathobj / path
        assert_in(p, annexinfo)
        if in_annex:
            assert_in('key', annexinfo[p], p)
        else:
            assert_not_in('key', annexinfo[p], p)
Example #16
def test_add_subdataset(path, other):
    subds = create(op.join(path, 'dir'), force=True)
    ds = create(path, force=True)
    ok_(subds.repo.dirty)
    ok_(ds.repo.dirty)
    assert_not_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # "add everything in subds to subds"
    save(dataset=subds.path)
    assert_repo_status(subds.path)
    assert_not_in('dir', ds.subdatasets(result_xfm='relpaths'))
    # but with a base directory we add the dataset subds as a subdataset
    # to ds
    res = ds.save(subds.path)
    assert_in_results(res, action="add", path=subds.path, refds=ds.path)
    assert_in('dir', ds.subdatasets(result_xfm='relpaths'))
    #  create another one
    other = create(other)
    # install into superdataset, but don't add
    other_clone = install(source=other.path, path=op.join(ds.path, 'other'))
    # little dance to get the revolution-type dataset
    other_clone = Dataset(other_clone.path)
    ok_(other_clone.is_installed())
    assert_not_in('other', ds.subdatasets(result_xfm='relpaths'))
    # now add, it should pick up the source URL
    ds.save('other')
    # and that is why we can reobtain it from origin
    ds.uninstall('other')
    ok_(not other_clone.is_installed())
    ds.get('other')
    ok_(other_clone.is_installed())
Example #17
def test_add_files(path):
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [op.join('dir', 'testindir'),
                   op.join('dir', OBSCURE_FILENAME)]

    for arg in [(test_list_1[0], False),
                (test_list_2[0], True),
                (test_list_3, False),
                (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                assert p.get('key', None) is None, f
            else:
                assert p.get('key', None) is not None, f
Example #18
def test_rerun_ambiguous_revision_file(path):
    ds = Dataset(path).create()
    ds.run('echo ambig > ambig')
    ds.repo.tag("ambig")
    # Don't fail when "ambig" refers to both a file and revision.
    ds.rerun(since="", revision="ambig", branch="rerun")
    eq_(len(ds.repo.repo.git.rev_list("rerun").split()),
        len(ds.repo.repo.git.rev_list("ambig", "--").split()))
Example #19
def test_encoding(path):
    staged = OBSCURE_FILENAME + u'_staged'
    untracked = OBSCURE_FILENAME + u'_untracked'
    ds = Dataset(path).create(force=True)
    ds.repo.add(staged)
    assert_repo_status(ds.path, added=[staged], untracked=[untracked])
    ds.save(updated=True)
    assert_repo_status(ds.path, untracked=[untracked])
Example #20
def make_demo_hierarchy_datasets(path, tree, parent=None):
    if parent is None:
        parent = Dataset(path).create(force=True)
    for node, items in tree.items():
        if isinstance(items, dict):
            node_path = opj(path, node)
            nodeds = parent.create(node_path, force=True)
            make_demo_hierarchy_datasets(node_path, items, parent=nodeds)
    return parent
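
The helper expects a nested dict in which every dict value becomes a subdataset; non-dict values are presumably file content put in place beforehand (e.g. via create_tree), which is why the creation uses force=True. A hypothetical usage sketch:

demo_tree = {
    'sub1': {
        'sub1sub1': {},
    },
    'sub2': {},
}
# layout and path are made up for illustration
top = make_demo_hierarchy_datasets('/tmp/hierarchy-demo', demo_tree)
top.save(recursive=True)  # commit the freshly created hierarchy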
Example #21
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            if_dirty='save-before'):

        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = dataset.path if isinstance(dataset, Dataset) else dataset
        content_by_ds, unavailable_paths = Interface._prep(
            path=path,
            dataset=dataset,
            recursive=recursive)
        if unavailable_paths:
            lgr.warning('ignored non-installed paths: %s', unavailable_paths)
        # upfront sanity and compliance checks
        if path_is_under(content_by_ds.keys()):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")
        # check that we have no top-level datasets and no files to process
        args_ok = True
        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            if ds_path not in paths:
                lgr.error(
                    "will not act on files at %s (consider the `drop` command)",
                    paths)
                args_ok = False
            if not ds.get_superdataset(
                    datalad_only=False,
                    topmost=False):
                lgr.error(
                    "will not uninstall top-level dataset at %s (consider the `remove` command)",
                    ds.path)
                args_ok = False
        if not args_ok:
            raise ValueError(
                'inappropriate arguments, see previous error message(s)')

        handle_dirty_datasets(
            content_by_ds, mode=if_dirty, base=dataset)

        results = []

        # iterate over all datasets, starting at the bottom
        # to deinit contained submodules first
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            results.extend(
                # we confirmed the super dataset presence above
                _uninstall_dataset(ds, check=check, has_super=True))
        # there is nothing to save at the end
        return results
Example #22
def test_bf2043p2(path):
    ds = Dataset(path).create(force=True)
    ds.add('staged', save=False)
    ok_clean_git(ds.path, head_modified=['staged'], untracked=['untracked'])
    # plain save does not commit untracked content
    # this tests the second issue in #2043
    with chpwd(path):
        save()
    ok_clean_git(ds.path, untracked=['untracked'])
Example #23
def test_bf3285(path):
    ds = Dataset(path).create(force=True)
    # Note: Using repo.pathobj matters in the "TMPDIR=/var/tmp/sym\ link" case
    # because assert_repo_status is based off of {Annex,Git}Repo.path, which is
    # the realpath'd path (from the processing in _flyweight_id_from_args).
    subds = create(ds.repo.pathobj.joinpath("subds"))
    # Explicitly saving a path does not save an untracked, unspecified
    # subdataset.
    ds.save("foo")
    assert_repo_status(ds.path, untracked=[subds.path])
Example #24
def test_py2_unicode_command(path):
    # Avoid OBSCURE_FILENAME to avoid windows-breakage (gh-2929).
    ds = Dataset(path).create()
    touch_cmd = "import sys; open(sys.argv[1], 'w').write('')"
    cmd_str = u"{} -c \"{}\" {}".format(sys.executable,
                                        touch_cmd,
                                        u"bβ0.dat")
    ds.run(cmd_str)
    assert_repo_status(ds.path)
    ok_exists(op.join(path, u"bβ0.dat"))

    if not on_windows:  # FIXME
        ds.run([sys.executable, "-c", touch_cmd, u"bβ1.dat"])
        assert_repo_status(ds.path)
        ok_exists(op.join(path, u"bβ1.dat"))

        # Send in a list of byte-strings to mimic a py2 command-line
        # invocation.
        ds.run([s.encode("utf-8")
                for s in [sys.executable, "-c", touch_cmd, u" β1 "]])
        assert_repo_status(ds.path)
        ok_exists(op.join(path, u" β1 "))

    with assert_raises(CommandError), swallow_outputs():
        ds.run(u"bβ2.dat")
Example #25
def test_save_message_file(path):
    ds = Dataset(path).create()
    with assert_raises(ValueError):
        ds.save("blah", message="me", message_file="and me")

    create_tree(path, {"foo": "x",
                       "msg": "add foo"})
    ds.repo.add("foo")
    ds.save(message_file=op.join(ds.path, "msg"))
    eq_(ds.repo.repo.git.show("--format=%s", "--no-patch"),
        "add foo")
Example #26
def test_create_curdir(path, path2):
    with chpwd(path, mkdir=True):
        create()
    ds = Dataset(path)
    ok_(ds.is_installed())
    assert_repo_status(ds.path, annex=True)

    with chpwd(path2, mkdir=True):
        create(no_annex=True)
    ds = Dataset(path2)
    ok_(ds.is_installed())
    assert_repo_status(ds.path, annex=False)
    ok_(op.exists(op.join(ds.path, '.noannex')))
Example #27
def test_nested_create(path):
    # to document a more organic usage pattern
    ds = Dataset(path).create()
    assert_repo_status(ds.path)
    lvl2relpath = op.join('lvl1', 'lvl2')
    lvl2path = op.join(ds.path, lvl2relpath)
    os.makedirs(lvl2path)
    os.makedirs(op.join(ds.path, 'lvl1', 'empty'))
    with open(op.join(lvl2path, 'file'), 'w') as f:
        f.write('some')
    ok_(ds.save())
    # Empty directories are filtered out.
    assert_repo_status(ds.path, untracked=[])
    # later create subdataset in a fresh dir
    # WINDOWS FAILURE IS NEXT LINE
    subds1 = ds.create(op.join('lvl1', 'subds'))
    assert_repo_status(ds.path, untracked=[])
    eq_(ds.subdatasets(result_xfm='relpaths'), [op.join('lvl1', 'subds')])
    # later create subdataset in an existing empty dir
    subds2 = ds.create(op.join('lvl1', 'empty'))
    assert_repo_status(ds.path)
    # later try to wrap existing content into a new subdataset
    # but that won't work
    assert_in_results(
        ds.create(lvl2relpath, **raw),
        status='error',
        message=(
            'collision with content in parent dataset at %s: %s',
            ds.path, [op.join(lvl2path, 'file')]))
    # even with force, as doing this properly would require complicated
    # surgery
    # MIH disable shaky test till proper dedicated upfront check is in-place in `create`
    # gh-1725
    #assert_in_results(
    #    ds.create(lvl2relpath, force=True,
    #              on_failure='ignore', result_xfm=None, result_filter=None),
    #    status='error', action='add')
    # only way to make it work is to unannex the content upfront
    ds.repo._run_annex_command('unannex', annex_options=[op.join(lvl2relpath, 'file')])
    # nothing to save, git-annex commits the unannex itself, but only on v5
    ds.repo.commit()
    # still nothing without force
    # "err='lvl1/lvl2' already exists in the index"
    assert_in_results(
        ds.create(lvl2relpath, **raw),
        status='error',
        message='will not create a dataset in a non-empty directory, use `force` option to ignore')
    # XXX even force doesn't help, because (I assume) GitPython doesn't update
    # its representation of the Git index properly
    ds.create(lvl2relpath, force=True)
    assert_in(lvl2relpath, ds.subdatasets(result_xfm='relpaths'))
Example #28
def _traverse_handle_subds(
        subds_rpath, rootds,
        recurse_datasets, recurse_directories, json):
    """A helper to deal with the subdataset node - recurse or just pick up
    may be alrady collected in it web meta
    """
    subds_path = opj(rootds.path, subds_rpath)
    subds = Dataset(subds_path)
    subds_json = metadata_locator(path='.', ds_path=subds_path)

    def handle_not_installed():
        # for now just traverse as fs
        lgr.warning("%s is either not installed or lacks meta-data", subds)
        subfs = fs_extract(subds_path, rootds, basepath=rootds.path)
        # but add a custom type indicating that it is a not-installed subds
        subfs['type'] = 'uninitialized'
        # we need to kick it out from 'children'
        # TODO: this is inefficient and cruel -- "ignored" should be made
        # smarter to ignore submodules for the repo
        #if fs['nodes']:
        #    fs['nodes'] = [c for c in fs['nodes'] if c['path'] != subds_rpath]
        return subfs

    if not subds.is_installed():
        subfs = handle_not_installed()
    elif recurse_datasets:
        subfs = ds_traverse(subds,
                            json=json,
                            recurse_datasets=recurse_datasets,
                            recurse_directories=recurse_directories,
                            parent=rootds)
        subfs.pop('nodes', None)
        #size_list.append(subfs['size'])
    # else just pick the data from metadata_file of each subdataset
    else:
        subfs = None
        lgr.info(subds.path)
        if exists(subds_json):
            with open(subds_json) as data_file:
                subfs = js.load(data_file)
                subfs.pop('nodes', None)  # remove children
                subfs['path'] = subds_rpath  # reassign the path
                #size_list.append(subfs['size'])
        else:
            # the same drill as if not installed
            lgr.warning("%s is installed but no meta-data yet", subds)
            subfs = handle_not_installed()
    # add URL field

    return subfs
Example #29
def test_save_directory(path):
    # Sequence of save invocations on subdirectories.
    ds = Dataset(path).create(force=True)
    ds._save(path='sdir1')
    ok_clean_git(ds.path, untracked=['sdir2/foo', 'sdir3/sdir/subsub/foo'])

    # Now the same via the plain command from within the dataset
    with chpwd(path):
        save(path='sdir2')
    ok_clean_git(ds.path, untracked=['sdir3/sdir/subsub/foo'])

    with chpwd(opj(path, 'sdir3')):
        save(path='sdir')
    ok_clean_git(ds.path)
Example #30
def test_run_explicit(path):
    ds = Dataset(path)

    assert_false(ds.repo.file_has_content("test-annex.dat"))

    create_tree(ds.path, {"dirt_untracked": "untracked",
                          "dirt_modified": "modified"})
    ds.save("dirt_modified", to_git=True)
    with open(op.join(path, "dirt_modified"), "a") as ofh:
        ofh.write(", more")

    # We need explicit=True to run with dirty repo.
    assert_status("impossible",
                  ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
                         inputs=["test-annex.dat"],
                         on_failure="ignore"))

    hexsha_initial = ds.repo.get_hexsha()
    # If we specify test-annex.dat as an input, it will be retrieved before the
    # run.
    ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
           inputs=["test-annex.dat"], explicit=True)
    ok_(ds.repo.file_has_content("test-annex.dat"))
    # We didn't commit anything because outputs weren't specified.
    assert_false(ds.repo.file_has_content("doubled.dat"))
    eq_(hexsha_initial, ds.repo.get_hexsha())

    # If an input doesn't exist, we just show the standard warning.
    with swallow_logs(new_level=logging.WARN) as cml:
        with swallow_outputs():
            ds.run("ls", inputs=["not-there"], explicit=True)
        assert_in("Input does not exist: ", cml.out)

    remove(op.join(path, "doubled.dat"))

    hexsha_initial = ds.repo.get_hexsha()
    ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
           inputs=["test-annex.dat"], outputs=["doubled.dat"],
           explicit=True)
    ok_(ds.repo.file_has_content("doubled.dat"))
    assert_repo_status(ds.path, modified=["dirt_modified"], untracked=['dirt_untracked'])
    neq_(hexsha_initial, ds.repo.get_hexsha())

    # Saving explicit outputs works from subdirectories.
    subdir = op.join(path, "subdir")
    mkdir(subdir)
    with chpwd(subdir):
        run("echo insubdir >foo", explicit=True, outputs=["foo"])
    ok_(ds.repo.file_has_content(op.join("subdir", "foo")))
Example #31
def test_force_checkdatapresent(srcpath, dstpath):
    src = Dataset(srcpath).create()
    target = mk_push_target(src, 'target', dstpath, annex=True, bare=True)
    (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.")
    src.save(to_git=False, message="New annex file")
    assert_repo_status(src.path, annex=True)
    whereis_prior = src.repo.whereis(files=['test_mod_annex_file'])[0]

    res = src.push(to='target', data='nothing')
    # nothing reported to be copied
    assert_not_in_results(res, action='copy')
    # we got the git-push nevertheless
    eq_(src.repo.get_hexsha(DEFAULT_BRANCH), target.get_hexsha(DEFAULT_BRANCH))
    # nothing moved
    eq_(whereis_prior, src.repo.whereis(files=['test_mod_annex_file'])[0])

    # now a push without forced no-transfer
    # we do not give since, so the non-transferred file is picked up
    # and transferred
    res = src.push(to='target', force=None)
    # no branch change, done before
    assert_in_results(res,
                      action='publish',
                      status='notneeded',
                      refspec=DEFAULT_REFSPEC)
    # but availability update
    assert_in_results(res,
                      action='publish',
                      status='ok',
                      refspec='refs/heads/git-annex:refs/heads/git-annex')
    assert_in_results(res,
                      status='ok',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy')
    # whereis info reflects the change
    ok_(
        len(whereis_prior) < len(
            src.repo.whereis(files=['test_mod_annex_file'])[0]))

    # doing it yet again will do nothing, because all is up to date
    assert_status('notneeded', src.push(to='target', force=None))
    # an explicit reference point doesn't change that
    assert_status('notneeded', src.push(to='target',
                                        force=None,
                                        since='HEAD~1'))

    # now force data transfer
    res = src.push(to='target', force='checkdatapresent')
    # no branch change, done before
    assert_in_results(res,
                      action='publish',
                      status='notneeded',
                      refspec=DEFAULT_REFSPEC)
    # no availability update
    assert_in_results(res,
                      action='publish',
                      status='notneeded',
                      refspec='refs/heads/git-annex:refs/heads/git-annex')
    # but data transfer
    assert_in_results(res,
                      status='ok',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy')

    # force data transfer, but data isn't available
    src.repo.drop('test_mod_annex_file')
    res = src.push(to='target',
                   path='.',
                   force='checkdatapresent',
                   on_failure='ignore')
    assert_in_results(res,
                      status='impossible',
                      path=str(src.pathobj / 'test_mod_annex_file'),
                      action='copy',
                      message='Slated for transport, but no content present')
Example #32
def test_save_obscure_name(path):
    ds = Dataset(path).create(force=True)
    fname = OBSCURE_FILENAME
    # Just check that we don't fail with a unicode error.
    with swallow_outputs():
        ds.save(path=fname, result_renderer="default")
Example #33
def test_path_diff(_path, linkpath):
    # do the setup on the real path, not the symlink, so that any of its
    # bugs do not affect this test
    ds = get_deeply_nested_structure(str(_path))
    if has_symlink_capability():
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(_path, target_is_directory=True)
        path = linkpath
    else:
        path = _path

    ds = Dataset(path)
    if has_symlink_capability():
        assert ds.pathobj != ds.repo.pathobj

    plain_recursive = ds.diff(recursive=True,
                              annex='all',
                              result_renderer=None)
    # check integrity of individual reports with a focus on how symlinks
    # are reported
    for res in plain_recursive:
        # anything that is an "intended" symlink should be reported
        # as such. In contrast, anything that is a symlink for mere
        # technical reasons (annex using it for something in some mode)
        # should be reported as the thing it is representing (i.e.
        # a file)
        if 'link2' in str(res['path']):
            assert res['type'] == 'symlink', res
        else:
            assert res['type'] != 'symlink', res
        # every item must report its parent dataset
        assert_in('parentds', res)

    # bunch of smoke tests
    # query of '.' is same as no path
    eq_(plain_recursive,
        ds.diff(path='.', recursive=True, annex='all', result_renderer=None))
    # duplicate paths do not change things
    eq_(
        plain_recursive,
        ds.diff(path=['.', '.'],
                recursive=True,
                annex='all',
                result_renderer=None))
    # neither do nested paths
    if not "2.24.0" <= ds.repo.git_version < "2.25.0":
        # Release 2.24.0 contained a regression that was fixed with 072a231016
        # (2019-12-10).
        eq_(
            plain_recursive,
            ds.diff(path=['.', 'subds_modified'],
                    recursive=True,
                    annex='all',
                    result_renderer=None))
    # when invoked in a subdir of a dataset it still reports on the full thing
    # just like `git status`, as long as there are no paths specified
    with chpwd(op.join(path, 'directory_untracked')):
        plain_recursive = diff(recursive=True,
                               annex='all',
                               result_renderer=None)
    # should be able to take absolute paths and yield the same
    # output
    eq_(
        plain_recursive,
        ds.diff(path=ds.path,
                recursive=True,
                annex='all',
                result_renderer=None))

    # query for a deeply nested path from the top, should just work with a
    # variety of approaches
    rpath = op.join('subds_modified', 'subds_lvl1_modified',
                    u'{}_directory_untracked'.format(OBSCURE_FILENAME))
    apathobj = ds.pathobj / rpath
    apath = str(apathobj)
    for p in (rpath, apath, None):
        if p is None:
            # change into the realpath of the dataset and
            # query with an explicit path
            with chpwd(ds.path):
                res = ds.diff(path=op.join('.', rpath),
                              recursive=True,
                              annex='all',
                              result_renderer=None)
        else:
            res = ds.diff(path=p,
                          recursive=True,
                          annex='all',
                          result_renderer=None)
        assert_result_count(
            res,
            1,
            state='untracked',
            type='directory',
            refds=ds.path,
            # path always comes out a full path inside the queried dataset
            path=apath,
        )

    assert_result_count(ds.diff(recursive=True, result_renderer=None),
                        1,
                        path=apath)
    # limiting recursion will exclude this particular path
    assert_result_count(ds.diff(recursive=True,
                                recursion_limit=1,
                                result_renderer=None),
                        0,
                        path=apath)
    # a negative limit means unlimited recursion
    eq_(ds.diff(recursive=True, recursion_limit=-1, result_renderer=None),
        ds.diff(recursive=True, result_renderer=None))
Example #34
def test_sidecar(path):
    ds = Dataset(path).create()
    # Simple sidecar message checks.
    ds.run("cd .> dummy0", message="sidecar arg", sidecar=True)
    assert_not_in('"cmd":', ds.repo.format_commit("%B"))

    ds.config.set("datalad.run.record-sidecar", "false", where="local")
    ds.run("cd .> dummy1", message="sidecar config")

    assert_in('"cmd":', last_commit_msg(ds.repo))

    ds.config.set("datalad.run.record-sidecar", "true", where="local")
    ds.run("cd .> dummy2", message="sidecar config")
    assert_not_in('"cmd":', last_commit_msg(ds.repo))

    # Don't break when config.get() returns multiple values. Here it's two
    # values in .gitconfig, but a more realistic scenario is a value in
    # $repo/.git/config that overrides a setting in ~/.config/git/config.
    ds.config.add("datalad.run.record-sidecar", "false", where="local")
    ds.run("cd .> dummy3", message="sidecar config")
    assert_in('"cmd":', last_commit_msg(ds.repo))

    # make sure sidecar file is committed when explicitly specifying outputs
    ds.run("cd .> dummy4",
           outputs=["dummy4"],
           sidecar=True,
           explicit=True,
           message="sidecar + specified outputs")
    assert_not_in('"cmd":', last_commit_msg(ds.repo))
    assert_repo_status(ds.path)
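
Whether the run record lands in the commit message or in a sidecar file is controlled by the datalad.run.record-sidecar config and the per-call sidecar keyword used above. A condensed sketch (path hypothetical; the sidecar=False override is assumed to mirror the sidecar=True call from the test):

from datalad.api import Dataset

ds = Dataset('/tmp/sidecar-demo').create()  # hypothetical path
ds.config.set('datalad.run.record-sidecar', 'true', where='local')
ds.run('cd .> out1')                 # record goes into a sidecar file
ds.run('cd .> out2', sidecar=False)  # assumed per-call override: record in message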
Example #35
def test_run_save_deletion(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    ds.run("{} to_remove".format("del" if on_windows else "rm"))
    assert_repo_status(ds.path)
Example #36
def test_run_assume_ready(path):
    ds = Dataset(path).create()
    repo = ds.repo
    adjusted = repo.is_managed_branch()

    # --assume-ready=inputs

    (repo.pathobj / "f1").write_text("f1")
    ds.save()

    def cat_cmd(fname):
        return [
            sys.executable, "-c",
            "import sys; print(open(sys.argv[-1]).read())", fname
        ]

    assert_in_results(ds.run(cat_cmd("f1"), inputs=["f1"]),
                      action="get",
                      type="file")
    # Same thing, but without the get() call.
    assert_not_in_results(ds.run(cat_cmd("f1"),
                                 inputs=["f1"],
                                 assume_ready="inputs"),
                          action="get",
                          type="file")

    ds.drop("f1", check=False)
    if not adjusted:
        # If the input is not actually ready, the command will fail.
        with assert_raises(CommandError):
            ds.run(cat_cmd("f1"), inputs=["f1"], assume_ready="inputs")

    # --assume-ready=outputs

    def unlink_and_write_cmd(fname):
        # This command doesn't care whether the output file is unlocked because
        # it removes it ahead of time anyway.
        return [
            sys.executable, "-c",
            "import sys; import os; import os.path as op; "
            "f = sys.argv[-1]; op.lexists(f) and os.unlink(f); "
            "open(f, mode='w').write(str(sys.argv))", fname
        ]

    (repo.pathobj / "f2").write_text("f2")
    ds.save()

    res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"])
    if not adjusted:
        assert_in_results(res, action="unlock", type="file")
    # Same thing, but without the unlock() call.
    res = ds.run(unlink_and_write_cmd("f2"),
                 outputs=["f2"],
                 assume_ready="outputs")
    assert_not_in_results(res, action="unlock", type="file")

    # --assume-ready=both

    res = ds.run(unlink_and_write_cmd("f2"), outputs=["f2"], inputs=["f2"])
    assert_in_results(res, action="get", type="file")
    if not adjusted:
        assert_in_results(res, action="unlock", type="file")

    res = ds.run(unlink_and_write_cmd("f2"),
                 outputs=["f2"],
                 inputs=["f2"],
                 assume_ready="both")
    assert_not_in_results(res, action="get", type="file")
    assert_not_in_results(res, action="unlock", type="file")
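
In short, assume_ready skips the automatic preparation that run() would otherwise perform for declared inputs and outputs. A sketch of the three settings (path and file name hypothetical; the command is a POSIX placeholder):

from datalad.api import Dataset

ds = Dataset('/tmp/assume-ready-demo').create()  # hypothetical path
(ds.pathobj / 'f1').write_text(u'f1')
ds.save()
cmd = 'cat f1'  # placeholder command
ds.run(cmd, inputs=['f1'])                           # get() inputs first
ds.run(cmd, inputs=['f1'], assume_ready='inputs')    # trust inputs are present
ds.run(cmd, outputs=['f1'], assume_ready='outputs')  # skip unlocking outputs
ds.run(cmd, inputs=['f1'], outputs=['f1'], assume_ready='both')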
Example #37
def test_ls_json(topdir):
    annex = AnnexRepo(topdir, create=True)
    ds = Dataset(topdir)
    # create some file and commit it
    open(opj(ds.path, 'subdsfile.txt'), 'w').write('123')
    ds.add(path='subdsfile.txt')
    ds.save("Hello!", version_tag=1)
    # add a subdataset
    ds.install('subds', source=topdir)

    git = GitRepo(opj(topdir, 'dir', 'subgit'), create=True)  # create git repo
    git.add(opj(topdir, 'dir', 'subgit', 'fgit.txt'),
            commit=True)  # commit to git to init git repo
    annex.add(opj(topdir, 'dir', 'subgit'),
              commit=True)  # add the non-dataset git repo to annex
    annex.add(opj(topdir, 'dir'), commit=True)  # add to annex (links)
    annex.drop(opj(topdir, 'dir', 'subdir', 'file2.txt'),
               options=['--force'])  # broken-link

    meta_dir = opj('.git', 'datalad', 'metadata')
    meta_path = opj(topdir, meta_dir)

    def get_metahash(*path):
        return hashlib.md5(opj(*path).encode('utf-8')).hexdigest()

    for all_ in [True, False]:
        for recursive in [True, False]:
            for state in ['file', 'delete']:
                with swallow_logs(), swallow_outputs():
                    ds = _ls_json(topdir,
                                  json=state,
                                  all_=all_,
                                  recursive=recursive)

                # subdataset should have its json created and deleted when all=True else not
                subds_metahash = get_metahash('/')
                subds_metapath = opj(topdir, 'subds', meta_dir, subds_metahash)
                assert_equal(exists(subds_metapath),
                             (state == 'file' and recursive))

                # root should have its json file created and deleted in all cases
                ds_metahash = get_metahash('/')
                ds_metapath = opj(meta_path, ds_metahash)
                assert_equal(exists(ds_metapath), state == 'file')

                # children should have their metadata json's created and deleted only when recursive=True
                child_metahash = get_metahash('dir', 'subdir')
                child_metapath = opj(meta_path, child_metahash)
                assert_equal(exists(child_metapath),
                             (state == 'file' and all_))

                # ignored directories should not have json files created in any case
                for subdir in [('.hidden'), ('dir', 'subgit')]:
                    child_metahash = get_metahash(*subdir)
                    assert_equal(exists(opj(meta_path, child_metahash)), False)

                # check if it's updated in its nodes sublist too; used by web-ui json (regression test)
                assert_equal(ds['nodes'][0]['size']['total'],
                             ds['size']['total'])

                # check size of subdataset
                subds = [
                    item for item in ds['nodes']
                    if item['name'] == ('subdsfile.txt' or 'subds')
                ][0]
                assert_equal(subds['size']['total'], '3 Bytes')

                # run non-recursive dataset traversal after subdataset metadata already created
                # to verify sub-dataset metadata being picked up from its metadata file in such cases
                if state == 'file' and recursive and not all_:
                    ds = _ls_json(topdir, json='file', all_=False)
                    subds = [
                        item for item in ds['nodes']
                        if item['name'] == ('subdsfile.txt' or 'subds')
                    ][0]
                    assert_equal(subds['size']['total'], '3 Bytes')
Example #38
def test_basic_filemeta(path):
    with chpwd(path):
        # no repo -> error
        assert_status('error', metadata(on_failure='ignore'))
        # fine with annex
        AnnexRepo('.', create=True)
        _assert_metadata_empty(metadata()[0]['metadata'])
        _assert_metadata_empty(metadata('.')[0]['metadata'])

    # create playing field
    create_tree(path, {
        'somefile': 'content',
        'dir': {
            'deepfile': 'othercontent'
        }
    })
    ds = Dataset(path)
    res = ds.metadata('somefile', add=['plaintag'], on_failure='ignore')
    assert_result_count(res,
                        1,
                        status='impossible',
                        message="metadata not supported (only annex'ed files)",
                        path=opj(ds.path, 'somefile'))
    ds.add('.')
    ok_clean_git(path)
    # full query -> 2 files
    res = ds.metadata(reporton='files')
    assert_result_count(res, 2)
    assert_result_count(res, 2, type='file', metadata={})

    #
    # tags: just a special case of a metadata key without a value
    #
    # tag one file
    target_file = opj('dir', 'deepfile')
    # needs a sequence or dict
    assert_raises(ValueError, ds.metadata, target_file, add='mytag')
    # like this
    res = ds.metadata(target_file, add=['mytag'], reporton='files')
    assert_result_count(res, 1)
    assert_result_count(res,
                        1,
                        type='file',
                        path=opj(ds.path, target_file),
                        metadata={'tag': 'mytag'})
    # now init tag for all files that don't have one yet
    res = ds.metadata(init=['rest'], reporton='files')
    assert_result_count(res, 2)
    # from before
    assert_result_count(res,
                        1,
                        type='file',
                        path=opj(ds.path, target_file),
                        metadata={'tag': 'mytag'})
    # and the other one
    assert_result_count(res,
                        1,
                        type='file',
                        path=opj(ds.path, 'somefile'),
                        metadata={'tag': 'rest'})
    # add two more different tags
    res = ds.metadata(add=['other1', 'other2', 'other3'], reporton='files')
    assert_result_count(res, 2)
    for r in res:
        assert_in('other1', r['metadata']['tag'])
        assert_in('other2', r['metadata']['tag'])
        assert_in('other3', r['metadata']['tag'])

    # now remove two specific tags that exist in all files
    res = ds.metadata(remove=['other1', 'other3'], reporton='files')
    assert_result_count(res, 2)
    for r in res:
        assert_not_in('other1', r['metadata']['tag'])
        assert_in('other2', r['metadata']['tag'])

    # and now one that only exists in one file
    res = ds.metadata(remove=['rest'], reporton='files')
    # we still get 2 results, because we still touch all files
    assert_result_count(res, 2)
    # however there is no modification to files that don't have the tag
    assert_result_count(res,
                        1,
                        type='file',
                        path=opj(ds.path, 'somefile'),
                        metadata={'tag': 'other2'})
    assert_result_count(res,
                        1,
                        type='file',
                        path=opj(ds.path, target_file),
                        metadata={'tag': ['mytag', 'other2']})

    # and finally kill the tags
    res = ds.metadata(target_file, reset=['tag'], reporton='files')
    assert_result_count(res, 1)
    assert_result_count(res,
                        1,
                        type='file',
                        metadata={},
                        path=opj(ds.path, target_file))
    # no change to the other one
    assert_result_count(ds.metadata('somefile'),
                        1,
                        type='file',
                        path=opj(ds.path, 'somefile'),
                        metadata={'tag': 'other2'})
    # kill all tags everywhere
    res = ds.metadata(reset=['tag'], reporton='files')
    assert_result_count(res, 2)
    assert_result_count(res, 2, type='file', metadata={})

    #
    # key: value mapping
    #
    # invalid key -> exception
    assert_raises(ValueError,
                  ds.metadata,
                  'somefile',
                  add={'hd%aa': ('v1', 'v2')})
    #                  on_failure='ignore')
    # unknown key, rejected by default
    res = ds.metadata('somefile',
                      add=dict(new=('v1', 'v2')),
                      on_failure='ignore')
    assert_status('error', res)
    res = ds.metadata('somefile',
                      add=dict(new=('v1', 'v2')),
                      define_key=dict(new="something fresh"))
    assert_result_count(res, 1, metadata={'new': ['v1', 'v2']})
    # same as this, which exists to support the way things come
    # in from the cmdline
    res = ds.metadata(target_file, add=[['new', 'v1', 'v2']])
    assert_result_count(res, 1, metadata={'new': ['v1', 'v2']})
    # other file got the exact same metadata now
    assert_result_count(ds.metadata(), 2, metadata={'new': ['v1', 'v2']})
    # reset with just a key removes the entire mapping
    res = ds.metadata(target_file, reset=['new'])
    assert_result_count(res, 1, metadata={})
    # reset with a mapping, overrides the old one
    res = ds.metadata('somefile',
                      reset=dict(new='george', more='yeah'),
                      permit_undefined_keys=True)
    assert_result_count(res, 1, metadata=dict(new='george', more='yeah'))
    # remove single value from mapping, last value to go removes the key
    res = ds.metadata('somefile', remove=dict(more='yeah'))
    assert_result_count(res, 1, metadata=dict(new='george'))
    # and finally init keys
    res = ds.metadata(init=dict(new=['two', 'three'], super='fresh'),
                      permit_undefined_keys=True,
                      reporton='files')
    assert_result_count(res, 2)
    assert_result_count(
        res,
        1,
        path=opj(ds.path, target_file),
        # order of values is not maintained
        metadata=dict(new=['three', 'two'], super='fresh'))
    assert_result_count(
        res,
        1,
        path=opj(ds.path, 'somefile'),
        # order of values is not maintained
        metadata=dict(new='george', super='fresh'))
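
The per-file tag handling exercised above boils down to a small vocabulary. A hedged sketch of the same calls, assuming a freshly created dataset in which the (hypothetical) file ends up annexed so that per-file metadata is supported:

from datalad.api import Dataset

ds = Dataset('/tmp/meta-demo').create()  # hypothetical path
(ds.pathobj / 'datafile.dat').write_text(u'content')
ds.save()  # with default settings the file should land in the annex
ds.metadata('datafile.dat', add=['raw'])     # attach a tag
ds.metadata('datafile.dat', remove=['raw'])  # drop that tag again
ds.metadata('datafile.dat', reset=['tag'])   # wipe the whole 'tag' field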
Example #39
def test_custom_native_merge(path):
    ds = Dataset(path).create(force=True)
    # no metadata, because nothing is committed
    _assert_metadata_empty(
        ds.metadata(reporton='datasets',
                    result_xfm='metadata',
                    return_type='item-or-list'))
    # enable BIDS metadata, BIDS metadata should become THE metadata
    ds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.aggregate_metadata()
    # no metadata, because still nothing is committed
    _assert_metadata_empty(
        ds.metadata(reporton='datasets',
                    result_xfm='metadata',
                    return_type='item-or-list'))
    ds.add('.')
    ds.aggregate_metadata()
    meta = ds.metadata(reporton='datasets',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'myds', 'author': ['one', 'two']}, meta)
    # now give the ds a custom name, must override the native one
    # but authors still come from BIDS
    ds.metadata(apply2global=True, add=dict(name='mycustom'))
    meta = ds.metadata(reporton='datasets',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'mycustom', 'author': ['one', 'two']}, meta)
    # we can disable the merge
    meta = ds.metadata(reporton='datasets',
                       merge_native='none',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({'name': u'mycustom'}, meta)
    # we can accumulate values
    meta = ds.metadata(reporton='datasets',
                       merge_native='add',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal({
        'name': ['mycustom', 'myds'],
        'author': ['one', 'two']
    }, meta)
    # we can have native override custom (not sure when needed, though)
    # add one more custom to make visible
    ds.metadata(apply2global=True, init=dict(homepage='fresh'))
    meta = ds.metadata(reporton='datasets',
                       merge_native='reset',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal(
        {
            'name': u'myds',
            'author': ['one', 'two'],
            'homepage': u'fresh'
        }, meta)
    # enable an additional metadata source
    ds.config.add('datalad.metadata.nativetype',
                  'frictionless_datapackage',
                  where='dataset')
    # we need to reaggregate after the config change
    ds.aggregate_metadata(merge_native='add')
    meta = ds.metadata(reporton='datasets',
                       merge_native='add',
                       result_xfm='metadata',
                       return_type='item-or-list')
    _clean_meta(meta)
    assert_dict_equal(
        {
            'name': ['mycustom', 'myds', 'someother'],
            'author': ['one', 'two'],
            'homepage': u'fresh'
        }, meta)
Example #40
def test_mod_hierarchy(path):
    base = Dataset(path).create()
    sub = base.create('sub')
    basedb_path = opj(base.path, '.datalad', 'metadata', 'dataset.json')
    subdb_path = opj(sub.path, '.datalad', 'metadata', 'dataset.json')
    assert (not exists(basedb_path))
    assert (not exists(subdb_path))
    # modify sub through base
    res = base.metadata('sub', init=['tag1'], apply2global=True)
    # only sub modified
    assert_result_count(res, 3)
    assert_result_count(res,
                        1,
                        status='ok',
                        action='metadata',
                        metadata={'tag': 'tag1'})
    assert_result_count(res, 2, status='ok', action='save')
    assert (not exists(basedb_path))
    assert (exists(subdb_path))
    # saved all the way up
    ok_clean_git(base.path)
    # now again, different init, sub has tag already, should be spared
    res = base.metadata(init=['tag2'], apply2global=True)
    assert_result_count(res, 2)
    assert_result_count(res,
                        1,
                        status='ok',
                        action='metadata',
                        metadata={'tag': 'tag2'},
                        path=base.path)
    assert_result_count(res, 1, status='ok', action='save', path=base.path)

    # and again with removal of all metadata in sub
    ok_clean_git(base.path)
    # put two probe files in place so we see that nothing unrelated gets saved
    create_tree(base.path, {
        'probe': 'content',
        'sub': {
            'probe': 'othercontent'
        }
    })
    res = base.metadata('sub', reset=['tag'], apply2global=True)
    assert_result_count(res, 3)
    assert_result_count(res, 1, status='ok', action='metadata', metadata={})
    assert_result_count(res, 1, status='ok', action='save', path=base.path)
    assert (exists(basedb_path))
    assert (not exists(subdb_path))
    # when we remove the probe files things should be clean
    os.remove(opj(base.path, 'probe'))
    os.remove(opj(sub.path, 'probe'))
    ok_clean_git(base.path)

    # now uninstall the subdataset and check that errors are caught properly
    base.uninstall(sub.path)
    ok_clean_git(base.path)
    res = base.metadata('sub',
                        add=['mike1'],
                        apply2global=True,
                        on_failure='ignore')
    assert_result_count(res,
                        1,
                        status='error',
                        path=sub.path,
                        message='cannot edit metadata of unavailable dataset')
Example #41
def test_basic_dsmeta(path):
    ds = Dataset(path).create()
    ok_clean_git(path)
    # ensure clean slate
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    _assert_metadata_empty(res[0]['metadata'])
    # init
    res = ds.metadata(init=['tag1', 'tag2'], apply2global=True)
    eq_(res[0]['metadata']['tag'], ['tag1', 'tag2'])
    # init again does nothing
    res = ds.metadata(init=['tag3'], apply2global=True)
    eq_(res[0]['metadata']['tag'], ['tag1', 'tag2'])
    # reset whole key
    ds.metadata(reset=['tag'], apply2global=True)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    _assert_metadata_empty(res[0]['metadata'])
    # add something arbitrary
    res = ds.metadata(add=dict(dtype=['heavy'], readme=['short', 'long']),
                      apply2global=True,
                      on_failure='ignore')
    # fails due to unknown keys
    assert_status('error', res)
    res = ds.metadata(add=dict(dtype=['heavy'], readme=['short', 'long']),
                      define_key=dict(dtype='is_a_datatype',
                                      readme='is_readme_content'),
                      apply2global=True)

    eq_(res[0]['metadata']['dtype'], 'heavy')
    # sorted!
    eq_(res[0]['metadata']['readme'], ['long', 'short'])
    # check it reports common keys
    with swallow_outputs() as cmo:
        ds.metadata(show_keys=True)
        assert_in('license', cmo.out)
    # supply key definitions, no need for apply2global
    res = ds.metadata(define_key=dict(mykey='truth'))
    eq_(res[0]['metadata']['definition']['mykey'], u'truth')
    with swallow_outputs() as cmo:
        ds.metadata(show_keys=True)
        assert_in('mykey: truth (dataset: {})'.format(ds.path), cmo.out)
    # re-supply different key definitions -> error
    res = ds.metadata(define_key=dict(mykey='lie'), on_failure='ignore')
    assert_result_count(
        res,
        1,
        status='error',
        message=("conflicting definition for key '%s': '%s' != '%s'", "mykey",
                 "lie", "truth"))
    res = ds.metadata(define_key=dict(otherkey='altfact'))
    eq_(res[0]['metadata']['definition']['otherkey'], 'altfact')
    # 'definition' is a regular key, we can remove items
    res = ds.metadata(remove=dict(definition=['mykey']), apply2global=True)
    assert_dict_equal(
        res[0]['metadata']['definition'], {
            'otherkey': u'altfact',
            'readme': u'is_readme_content',
            'dtype': u'is_a_datatype'
        })
    res = ds.metadata(remove=dict(definition=['otherkey', 'readme', 'dtype']),
                      apply2global=True)
    # when there are no items left, the key vanishes too
    assert ('definition' not in res[0]['metadata'])
    # we still have metadata, so there is a DB file
    assert (res[0]['metadata'])
    db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
    assert (exists(db_path))
    ok_clean_git(ds.path)
    # but if we remove it, the file is gone
    res = ds.metadata(reset=['readme', 'dtype'], apply2global=True)
    eq_(res[0]['metadata'], {})
    assert (not exists(db_path))
    ok_clean_git(ds.path)
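A minimal sketch of the dataset-level pattern exercised above, assuming the same (older) `datalad metadata` API; the dataset path, key name, and key description are made up for illustration:

from datalad.api import Dataset

ds = Dataset('/tmp/meta-demo').create()     # hypothetical path
# a custom key must be defined before values for it can be stored globally
res = ds.metadata(define_key=dict(species='the sampled organism'),
                  add=dict(species=['human']),
                  apply2global=True)
# dataset-level metadata ends up in .datalad/metadata/dataset.json
print(res[0]['metadata'].get('species'))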
Exemple #42
0
def test_dry_run(path):
    ds = Dataset(path).create(force=True)

    # The dataset is reported as dirty, and the custom result renderer relays
    # that to the default renderer.
    with swallow_outputs() as cmo:
        with assert_raises(IncompleteResultsError):
            ds.run("blah ", dry_run="basic")
        assert_in("run(impossible)", cmo.out)
        assert_not_in("blah", cmo.out)

    ds.save()

    with swallow_outputs() as cmo:
        ds.run("blah ", dry_run="basic")
        assert_in("Dry run", cmo.out)
        assert_in("location", cmo.out)
        assert_in("blah", cmo.out)
        assert_not_in("expanded inputs", cmo.out)
        assert_not_in("expanded outputs", cmo.out)

    with swallow_outputs() as cmo:
        ds.run("blah {inputs} {outputs}",
               dry_run="basic",
               inputs=["fo*"],
               outputs=["b*r"])
        assert_in('blah "foo" "bar"' if on_windows else "blah foo bar",
                  cmo.out)
        assert_in("expanded inputs", cmo.out)
        assert_in("['foo']", cmo.out)
        assert_in("expanded outputs", cmo.out)
        assert_in("['bar']", cmo.out)

    # Just the command.
    with swallow_outputs() as cmo:
        ds.run("blah ", dry_run="command")
        assert_not_in("Dry run", cmo.out)
        assert_in("blah", cmo.out)
        assert_not_in("inputs", cmo.out)

    # The output file wasn't unlocked.
    assert_repo_status(ds.path)

    # Subdataset handling

    subds = ds.create("sub")
    (subds.pathobj / "baz").write_text("z")
    ds.save(recursive=True)

    # If a subdataset is installed, it works as usual.
    with swallow_outputs() as cmo:
        ds.run("blah {inputs}", dry_run="basic", inputs=["sub/b*"])
        assert_in('blah "sub\\baz"' if on_windows else 'blah sub/baz', cmo.out)

    # However, a dry run will not do the install/reglob procedure.
    ds.uninstall("sub", check=False)
    with swallow_outputs() as cmo:
        ds.run("blah {inputs}", dry_run="basic", inputs=["sub/b*"])
        assert_in("sub/b*", cmo.out)
        assert_not_in("baz", cmo.out)
Exemple #43
0
    def __call__(dataset, urlfile, urlformat, filenameformat,
                 input_type="ext", exclude_autometa=None, meta=None,
                 message=None, dry_run=False, fast=False, ifexists=None,
                 missing_value=None, save=True):
        # Temporarily work around gh-2269.
        url_file, url_format, filename_format = urlfile, urlformat, filenameformat

        import logging
        import os

        from requests.exceptions import RequestException

        from datalad.distribution.add import Add
        from datalad.distribution.create import Create
        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.dochelpers import exc_str
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")


        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type,
                                         url_format, filename_format,
                                         exclude_autometa, meta,
                                         dry_run,
                                         missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s",
                         row["url"], os.path.join(dataset.path, row["filename"]))
                lgr.info("Metadata: %s",
                         sorted(u"{}={}".format(k, v)
                                for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.create(result_xfm=None, return_type='generator', save=save):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning(
                    "Not creating subdataset at existing path: %s",
                    spath)
            else:
                for r in dataset.create(spath, result_xfm=None,
                                        return_type='generator', save=save):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path, row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({"filename_abs": filename_abs,
                        "ds": ds_current,
                        "ds_filename": ds_filename})

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            for r in dataset.add(files_to_add, message=msg, save=save):
                yield r

            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r
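The entry point above reads a table of URLs, derives a file name for every row, downloads the content into the dataset, and creates subdatasets for any sub-paths reported by `extract()`. A minimal sketch of how it might be driven through the Python API, assuming it is exposed as `datalad.api.addurls` with the parameter names from the signature above; the CSV file, its columns, and the URLs are made up for illustration:

# hypothetical urls.csv:
#   name,url
#   one.dat,http://example.com/one.dat
#   two.dat,http://example.com/two.dat
from datalad.api import Dataset, addurls

ds = Dataset('/tmp/addurls-demo').create()
# '{url}' and '{name}' refer to CSV columns; input_type is inferred from
# the .csv extension, as in the code above
addurls(dataset=ds, urlfile='urls.csv',
        urlformat='{url}', filenameformat='{name}',
        fast=True, dry_run=True)   # dry_run=True only logs what would happen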
def test_push_wanted(srcpath, dstpath):
    src = Dataset(srcpath).create()

    if src.repo.is_managed_branch():
        # on crippled FS post-update hook enabling via create-sibling doesn't
        # work ATM
        raise SkipTest("no create-sibling on crippled FS")
    (src.pathobj / 'data.0').write_text('0')
    (src.pathobj / 'secure.1').write_text('1')
    (src.pathobj / 'secure.2').write_text('2')
    src.save()

    # Dropping a file to mimic a case of simply not having it locally (thus not
    # to be "pushed")
    src.drop('secure.2', check=False)

    # Annotate sensitive content, actual value "verysecure" does not matter in
    # this example
    src.repo.set_metadata(add={'distribution-restrictions': 'verysecure'},
                          files=['secure.1', 'secure.2'])

    src.create_sibling(
        dstpath,
        annex_wanted="not metadata=distribution-restrictions=*",
        name='target',
    )
    # check that wanted is obeyed, since set in sibling configuration
    res = src.push(to='target')
    assert_in_results(res,
                      action='copy',
                      path=str(src.pathobj / 'data.0'),
                      status='ok')
    for p in ('secure.1', 'secure.2'):
        assert_not_in_results(res, path=str(src.pathobj / p))
    assert_status('notneeded', src.push(to='target'))

    # check the target to really make sure
    dst = Dataset(dstpath)
    # normal file, yes
    eq_((dst.pathobj / 'data.0').read_text(), '0')
    # secure file, no
    if dst.repo.is_managed_branch():
        neq_((dst.pathobj / 'secure.1').read_text(), '1')
    else:
        assert_raises(FileNotFoundError, (dst.pathobj / 'secure.1').read_text)

    # reset wanted config, which must enable push of secure file
    src.repo.set_preferred_content('wanted', '', remote='target')
    res = src.push(to='target')
    assert_in_results(res, path=str(src.pathobj / 'secure.1'))
    eq_((dst.pathobj / 'secure.1').read_text(), '1')
Exemple #45
0
def test_run_explicit(path):
    ds = Dataset(path)

    assert_false(ds.repo.file_has_content("test-annex.dat"))

    create_tree(ds.path, {
        "dirt_untracked": "untracked",
        "dirt_modified": "modified"
    })
    ds.save("dirt_modified", to_git=True)
    with open(op.join(path, "dirt_modified"), "a") as ofh:
        ofh.write(", more")

    # We need explicit=True to run with dirty repo.
    assert_status(
        "impossible",
        ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
               inputs=["test-annex.dat"],
               on_failure="ignore"))

    hexsha_initial = ds.repo.get_hexsha()
    # If we specify test-annex.dat as an input, it will be retrieved before the
    # run.
    ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
           inputs=["test-annex.dat"],
           explicit=True)
    ok_(ds.repo.file_has_content("test-annex.dat"))
    # We didn't commit anything because outputs weren't specified.
    assert_false(ds.repo.file_has_content("doubled.dat"))
    eq_(hexsha_initial, ds.repo.get_hexsha())

    # If an input doesn't exist, we just show the standard warning.
    with assert_raises(IncompleteResultsError):
        ds.run("ls", inputs=["not-there"], explicit=True, on_failure="stop")

    remove(op.join(path, "doubled.dat"))

    hexsha_initial = ds.repo.get_hexsha()
    ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
           inputs=["test-annex.dat"],
           outputs=["doubled.dat"],
           explicit=True)
    ok_(ds.repo.file_has_content("doubled.dat"))
    assert_repo_status(ds.path,
                       modified=["dirt_modified"],
                       untracked=['dirt_untracked'])
    neq_(hexsha_initial, ds.repo.get_hexsha())

    # Saving explicit outputs works from subdirectories.
    subdir = op.join(path, "subdir")
    mkdir(subdir)
    with chpwd(subdir):
        run("echo insubdir >foo", explicit=True, outputs=["foo"])
    ok_(ds.repo.file_has_content(op.join("subdir", "foo")))
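To summarize the pattern tested above: `explicit=True` lets `datalad run` proceed in a dirty working tree, retrieves the declared inputs before executing the command, and saves only the declared outputs afterwards. A minimal sketch, assuming a dataset `ds` that already tracks the input file; file names are illustrative:

ds.run("cat raw.dat raw.dat > doubled.dat",
       inputs=["raw.dat"],        # retrieved before the command runs
       outputs=["doubled.dat"],   # the only change that gets committed
       explicit=True)             # do not complain about unrelated dirt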
Exemple #46
0
def test_remove_uninstalled(path=None):
    ds = Dataset(path)
    assert_raises(ValueError, ds.remove)
Exemple #47
0
def test_run_from_subds(path):
    subds = Dataset(path).create().create("sub")
    subds.run("cd .> foo")
    assert_repo_status(subds.path)
Exemple #48
0
def test_clean_subds_removal(path=None):
    ds = Dataset(path).create()
    subds1 = ds.create('one')
    subds2 = ds.create('two')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['one', 'two'])
    assert_repo_status(ds.path)
    # now kill one
    res = ds.remove('one', reckless='availability', result_xfm=None)
    # subds1 got uninstalled, and ds got the removal of subds1 saved
    assert_result_count(res,
                        1,
                        path=subds1.path,
                        action='uninstall',
                        status='ok')
    assert_result_count(res, 1, path=subds1.path, action='remove', status='ok')
    assert_result_count(res, 1, path=ds.path, action='save', status='ok')
    ok_(not subds1.is_installed())
    assert_repo_status(ds.path)
    # two must remain
    eq_(ds.subdatasets(result_xfm='relpaths'), ['two'])
    # one is gone
    nok_(subds1.pathobj.exists())
    # and now again, but this time remove something that is not installed
    ds.create('three')
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    ds.drop('two', what='all', reckless='availability')
    assert_repo_status(ds.path)
    eq_(sorted(ds.subdatasets(result_xfm='relpaths')), ['three', 'two'])
    nok_(subds2.is_installed())
    # orderly empty mountpoint is maintained
    ok_(subds2.pathobj.exists())
    res = ds.remove('two', reckless='availability')
    assert_in_results(res, path=str(ds.pathobj / 'two'), action='remove')
    assert_repo_status(ds.path)
    # subds2 was already uninstalled, now ds got the removal of subds2 saved
    nok_(subds2.pathobj.exists())
    eq_(ds.subdatasets(result_xfm='relpaths'), ['three'])
Exemple #49
0
def get_modified_subpaths(aps,
                          refds,
                          revision,
                          recursion_limit=None,
                          report_no_revision_change=True,
                          report_untracked='all'):
    """
    Parameters
    ----------
    aps : list
    refds : Dataset
    revision : str
      Commit-ish
    """
    from datalad.interface.diff import Diff

    # TODO needs recursion limit
    # NOTE this is implemented as a generator despite the fact that we need
    # to sort through _all_ the inputs initially, diff'ing each involved
    # dataset takes time that we can use to already act on intermediate
    # result paths, without having to wait for 100% completion
    if revision is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        for r in aps:
            yield r
        return

    # life is simple: we diff the base dataset
    modified = []
    for r in refds.diff(
            # we cannot really limit the diff paths easily because we might get
            # or miss content (e.g. subdatasets) if we don't figure out which
            # ones are known -- and we don't want that
            path=None,
            # `revision` can be anything that Git supports for `diff`
            # `True` is code for diff without revision
            revision=revision if revision is not True else None,
            # it is important that staged is False, otherwise we would miss unstaged
            # changes when e.g. diffing against HEAD (save does that)
            staged=False,
            # we might want to consider putting 'untracked' here
            # maybe that is a little faster, not tested yet
            ignore_subdatasets='none',
            # by default, we want to see any individual untracked file, this simplifies further
            # processing dramatically, but may require subsequent filtering
            # in order to avoid flooding user output with useless info
            report_untracked=report_untracked,
            # no recursion, we need to update `revision` for every subdataset
            # before we can `diff`
            recursive=False,
            return_type='generator',
            result_renderer=None,
            # need to be able to yield the errors
            on_failure='ignore'):
        if r['status'] in ('impossible', 'error'):
            # something unexpected, tell daddy
            yield r
            continue
        # if asked, and no change in revision -- skip
        if not report_no_revision_change \
                and (r.get('revision_src') or r.get('revision')) \
                and (r.get('revision_src') == r.get('revision')):
            continue
        r['status'] = ''
        modified.append(r)

    if not len(modified):
        # nothing modified, nothing to report
        return

    # now we can grab the APs that are in this dataset and yield them
    for ap in aps:
        # need to preserve pristine info first
        ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for m in modified:
            if ap['path'] == m['path']:
                # is directly modified, yield input AP
                # but update with what we learned about the modification
                ap.update(m)
                yield ap
                break
            if path_is_subpath(m['path'], ap['path']):
                # a modified path is underneath this AP
                # yield the modified one instead
                yield m
                continue

    mod_subs = [m for m in modified if m.get('type', None) == 'dataset']
    if not mod_subs or (recursion_limit is not None and recursion_limit < 1):
        return

    aps = [
        ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for ap in aps
    ]
    # now for all submodules that were found modified
    for sub in [m for m in modified if m.get('type', None) == 'dataset']:
        sub_path_ = _with_sep(sub['path'])
        # these APs match something inside this submodule, or the whole submodule
        sub_aps = [
            ap for ap in aps if _with_sep(ap['path']).startswith(sub_path_)
        ]
        if not sub_aps:
            continue
        # we are interested in the modifications within this subdataset
        # from the state we previously had on record, till the state
        # we have in record now
        diff_range = '{}..{}'.format(
            sub['revision_src'] if sub['revision_src'] else
            PRE_INIT_COMMIT_SHA, sub['revision'] if sub['revision'] else '')
        if sub['revision_src'] and sub['revision_src'] == sub['revision']:
            # this is a special case, where subdataset reported changes without
            # a change in state/commit -- this is code for uncommitted changes
            # in the subdataset (including staged ones). In such a case, we
            # must not provide a diff range, but only the source commit we want
            # to diff against
            # XXX if this is changed, likely the same logic in diff needs
            # changing too!
            diff_range = sub['revision_src']

        for r in get_modified_subpaths(
                sub_aps,
                Dataset(sub['path']),
                diff_range,
                recursion_limit=(recursion_limit -
                                 1) if recursion_limit is not None else None):
            yield r
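A minimal sketch of how this generator might be consumed, following the docstring above: `aps` is a list of raw or annotated paths, `refds` the reference dataset, and `revision` any commit-ish to diff against. It assumes the same module context as the function itself (i.e. `Dataset` and `get_modified_subpaths` importable as in DataLad's annotate_paths module); the dataset path and file names are made up:

refds = Dataset('/tmp/some-ds')    # hypothetical, must be an installed dataset
for res in get_modified_subpaths(['file1.txt', 'subds/file2.txt'],
                                 refds=refds,
                                 revision='HEAD~1'):
    # each result is an annotated-path dict; 'state'/'type' carry what the
    # underlying diff reported for that path
    print(res['path'], res.get('state'))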
Exemple #50
0
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 action=None,
                 unavailable_path_status='',
                 unavailable_path_msg=None,
                 nondataset_path_status='error',
                 force_parentds_discovery=True,
                 force_subds_discovery=True,
                 force_no_revision_change_discovery=True,
                 force_untracked_discovery=True,
                 modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None
                                     or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)"
            )

        # prep common result props
        res_kwargs = dict(action=action if action else 'annotate_path',
                          refds=refds_path,
                          logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(refds, refds_path, action,
                                         recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(**dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = [
                r for r in requested_paths
                if not lexists(r['path'] if isinstance(r, dict) else r)
            ]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(
                    opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or
                                           (refds_path
                                            and _with_sep(oneupdir).startswith(
                                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(
                            normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                (path_type == 'dataset'
                 and 'registered_subds' not in path_props)
                    or path_type == 'directory' or not lexists(path)):
                from datalad.distribution.subdatasets import Subdatasets
                # if the path doesn't exist, is labeled a directory, or is a
                # dataset lacking this info -> record whether it is a known
                # subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(fulfilled=None,
                                                   recursive=False,
                                                   result_xfm=None,
                                                   result_filter=None,
                                                   return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get('status',
                                               unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(
                        parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action,
                                         recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=
                        force_no_revision_change_discovery,
                        report_untracked='all'
                        if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
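A minimal sketch of how this path-annotation helper might be called, assuming it is the `AnnotatePaths` interface of older DataLad versions (datalad.interface.annotate_paths); the paths are made up for illustration. Every yielded result is a dict describing one path (its `type`, `parentds`, `state`, ...), and an empty `status` means 'keep processing':

from datalad.interface.annotate_paths import AnnotatePaths

for res in AnnotatePaths.__call__(path=['file1.txt', 'sub'],
                                  dataset='.',
                                  recursive=False):
    print(res['path'], res.get('type'), res.get('status'))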
Exemple #51
0
def test_repo_diff(path, norepo):
    ds = Dataset(path).create()
    assert_repo_status(ds.path)
    assert_raises(ValueError, ds.repo.diff, fr='WTF', to='MIKE')

    if ds.repo.is_managed_branch():
        fr_base = DEFAULT_BRANCH
        to = DEFAULT_BRANCH
    else:
        fr_base = "HEAD"
        to = None

    # no diff
    eq_(ds.repo.diff(fr_base, to), {})
    # bogus path makes no difference
    eq_(ds.repo.diff(fr_base, to, paths=['THIS']), {})
    # let's introduce a known change
    create_tree(ds.path, {'new': 'empty'})
    ds.save(to_git=True)
    assert_repo_status(ds.path)
    eq_(
        ds.repo.diff(fr=fr_base + '~1', to=fr_base), {
            ut.Path(ds.repo.pathobj / 'new'): {
                'state': 'added',
                'type': 'file',
                'bytesize': 5,
                'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6'
            }
        })
    # modify known file
    create_tree(ds.path, {'new': 'notempty'})
    eq_(
        ds.repo.diff(fr='HEAD', to=None),
        {
            ut.Path(ds.repo.pathobj / 'new'): {
                'state': 'modified',
                'type': 'file',
                # the beast is modified, but no change in shasum -> not staged
                'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6',
                'prev_gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6'
            }
        })
    # per path query gives the same result
    eq_(ds.repo.diff(fr=fr_base, to=to),
        ds.repo.diff(fr=fr_base, to=to, paths=['new']))
    # giving a directory as a constraint does the same
    eq_(ds.repo.diff(fr=fr_base, to=to),
        ds.repo.diff(fr=fr_base, to=to, paths=['.']))
    # but if we give another path, it doesn't show up
    eq_(ds.repo.diff(fr=fr_base, to=to, paths=['other']), {})

    # make clean
    ds.save()
    assert_repo_status(ds.path)

    # untracked stuff
    create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}})
    # default is to report all files
    eq_(
        ds.repo.diff(fr='HEAD', to=None), {
            ut.Path(ds.repo.pathobj / 'deep' / 'down'): {
                'state': 'untracked',
                'type': 'file'
            },
            ut.Path(ds.repo.pathobj / 'deep' / 'down2'): {
                'state': 'untracked',
                'type': 'file'
            }
        })
    # but can be made more compact
    eq_(
        ds.repo.diff(fr='HEAD', to=None, untracked='normal'), {
            ut.Path(ds.repo.pathobj / 'deep'): {
                'state': 'untracked',
                'type': 'directory'
            }
        })

    # again, an unmatching path constraint will give an empty report
    eq_(ds.repo.diff(fr='HEAD', to=None, paths=['other']), {})
    # perfect match and anything underneath will do
    eq_(
        ds.repo.diff(fr='HEAD', to=None, paths=['deep']), {
            ut.Path(ds.repo.pathobj / 'deep' / 'down'): {
                'state': 'untracked',
                'type': 'file'
            },
            ut.Path(ds.repo.pathobj / 'deep' / 'down2'): {
                'state': 'untracked',
                'type': 'file'
            }
        })
Exemple #52
0
def test_diff(path, norepo):
    with chpwd(norepo):
        assert_raises(NoDatasetFound, diff)
    ds = Dataset(path).create()
    assert_repo_status(ds.path)
    # reports stupid revision input
    assert_result_count(ds.diff(fr='WTF',
                                on_failure='ignore',
                                result_renderer=None),
                        1,
                        status='impossible',
                        message="Git reference 'WTF' invalid")
    # no diff
    assert_result_count(_dirty_results(ds.diff(result_renderer=None)), 0)
    assert_result_count(
        _dirty_results(ds.diff(fr='HEAD', result_renderer=None)), 0)
    # bogus path makes no difference
    assert_result_count(
        _dirty_results(ds.diff(path='THIS', fr='HEAD', result_renderer=None)),
        0)
    # let's introduce a known change
    create_tree(ds.path, {'new': 'empty'})
    ds.save(to_git=True)
    assert_repo_status(ds.path)

    if ds.repo.is_managed_branch():
        fr_base = DEFAULT_BRANCH
        to = DEFAULT_BRANCH
    else:
        fr_base = "HEAD"
        to = None

    res = _dirty_results(
        ds.diff(fr=fr_base + '~1', to=to, result_renderer=None))
    assert_result_count(res, 1)
    assert_result_count(res,
                        1,
                        action='diff',
                        path=op.join(ds.path, 'new'),
                        state='added')
    # we can also find the diff without going through the dataset explicitly
    with chpwd(ds.path):
        assert_result_count(_dirty_results(
            diff(fr=fr_base + '~1', to=to, result_renderer=None)),
                            1,
                            action='diff',
                            path=op.join(ds.path, 'new'),
                            state='added')
    # no diff against HEAD
    assert_result_count(_dirty_results(ds.diff(result_renderer=None)), 0)
    # modify known file
    create_tree(ds.path, {'new': 'notempty'})
    res = _dirty_results(ds.diff(result_renderer=None))
    assert_result_count(res, 1)
    assert_result_count(res,
                        1,
                        action='diff',
                        path=op.join(ds.path, 'new'),
                        state='modified')
    # but if we give another path, it doesn't show up
    assert_result_count(ds.diff(path='otherpath', result_renderer=None), 0)
    # giving the right path must work though
    assert_result_count(ds.diff(path='new', result_renderer=None),
                        1,
                        action='diff',
                        path=op.join(ds.path, 'new'),
                        state='modified')
    # stage changes
    ds.repo.add('.', git=True)
    # no change in diff, staged is not committed
    assert_result_count(_dirty_results(ds.diff(result_renderer=None)), 1)
    ds.save()
    assert_repo_status(ds.path)
    assert_result_count(_dirty_results(ds.diff(result_renderer=None)), 0)

    # untracked stuff
    create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}})
    # a plain diff should report the untracked file
    # but not directly, because the parent dir is already unknown
    res = _dirty_results(ds.diff(result_renderer=None))
    assert_result_count(res, 1)
    assert_result_count(res,
                        1,
                        state='untracked',
                        type='directory',
                        path=op.join(ds.path, 'deep'))
    # report of individual files is also possible
    assert_result_count(ds.diff(untracked='all', result_renderer=None),
                        2,
                        state='untracked',
                        type='file')
    # an unmatching path will hide this result
    assert_result_count(ds.diff(path='somewhere', result_renderer=None), 0)
    # perfect match and anything underneath will do
    assert_result_count(ds.diff(path='deep', result_renderer=None),
                        1,
                        state='untracked',
                        path=op.join(ds.path, 'deep'),
                        type='directory')
    assert_result_count(ds.diff(path='deep', result_renderer=None),
                        1,
                        state='untracked',
                        path=op.join(ds.path, 'deep'))
    ds.repo.add(op.join('deep', 'down2'), git=True)
    # now the remaining file is the only untracked one
    assert_result_count(ds.diff(result_renderer=None),
                        1,
                        state='untracked',
                        path=op.join(ds.path, 'deep', 'down'),
                        type='file')
Exemple #53
0
def test_diff_recursive(path):
    ds = Dataset(path).create()
    sub = ds.create('sub')
    # look at the last change, and confirm a dataset was added
    res = ds.diff(fr=DEFAULT_BRANCH + '~1',
                  to=DEFAULT_BRANCH,
                  result_renderer=None)
    assert_result_count(res,
                        1,
                        action='diff',
                        state='added',
                        path=sub.path,
                        type='dataset')
    # now recursive
    res = ds.diff(recursive=True,
                  fr=DEFAULT_BRANCH + '~1',
                  to=DEFAULT_BRANCH,
                  result_renderer=None)
    # we also get the entire diff of the subdataset from scratch
    assert_status('ok', res)
    ok_(len(res) > 3)
    # one specific test
    assert_result_count(res,
                        1,
                        action='diff',
                        state='added',
                        path=op.join(sub.path, '.datalad', 'config'))

    # now add one file to the parent and one to the subdataset
    create_tree(ds.path, {
        'onefile': 'tobeadded',
        'sub': {
            'twofile': 'tobeadded'
        }
    })
    res = ds.diff(recursive=True, untracked='all', result_renderer=None)
    assert_result_count(_dirty_results(res), 3)
    assert_result_count(res,
                        1,
                        action='diff',
                        state='untracked',
                        path=op.join(ds.path, 'onefile'),
                        type='file')
    assert_result_count(res,
                        1,
                        action='diff',
                        state='modified',
                        path=sub.path,
                        type='dataset')
    assert_result_count(res,
                        1,
                        action='diff',
                        state='untracked',
                        path=op.join(sub.path, 'twofile'),
                        type='file')
    # intentional save in two steps to make check below easier
    ds.save('sub', recursive=True)
    ds.save()
    assert_repo_status(ds.path)

    head_ref = DEFAULT_BRANCH if ds.repo.is_managed_branch() else 'HEAD'

    # look at the last change, only one file was added
    res = ds.diff(fr=head_ref + '~1', to=head_ref, result_renderer=None)
    assert_result_count(_dirty_results(res), 1)
    assert_result_count(res,
                        1,
                        action='diff',
                        state='added',
                        path=op.join(ds.path, 'onefile'),
                        type='file')

    # now the exact same thing with recursion, must not be different from the
    # call above
    res = ds.diff(recursive=True,
                  fr=head_ref + '~1',
                  to=head_ref,
                  result_renderer=None)
    assert_result_count(_dirty_results(res), 1)
    # last change in parent
    assert_result_count(res,
                        1,
                        action='diff',
                        state='added',
                        path=op.join(ds.path, 'onefile'),
                        type='file')

    if ds.repo.is_managed_branch():
        raise SkipTest(
            "Test assumption broken: https://github.com/datalad/datalad/issues/3818"
        )
    # one further back brings in the modified subdataset, and the added file
    # within it
    res = ds.diff(recursive=True,
                  fr=head_ref + '~2',
                  to=head_ref,
                  result_renderer=None)
    assert_result_count(_dirty_results(res), 3)
    assert_result_count(res,
                        1,
                        action='diff',
                        state='added',
                        path=op.join(ds.path, 'onefile'),
                        type='file')
    assert_result_count(res,
                        1,
                        action='diff',
                        state='added',
                        path=op.join(sub.path, 'twofile'),
                        type='file')
    assert_result_count(res,
                        1,
                        action='diff',
                        state='modified',
                        path=sub.path,
                        type='dataset')
Exemple #54
0
def get_baseline(p):
    ds = Dataset(p).create()
    sub = create(str(ds.pathobj / 'sub'))
    assert_repo_status(ds.path, untracked=['sub'])
    return ds
def check_push(annex, src_path, dst_path):
    # prepare src
    src = Dataset(src_path).create(annex=annex)
    src_repo = src.repo
    # push should not add branches to the local dataset
    orig_branches = src_repo.get_branches()
    assert_not_in('synced/' + DEFAULT_BRANCH, orig_branches)

    res = src.push(on_failure='ignore')
    assert_result_count(res, 1)
    assert_in_results(
        res,
        status='impossible',
        message='No push target given, and none could be auto-detected, '
        'please specify via --to')
    eq_(orig_branches, src_repo.get_branches())
    # target sibling
    target = mk_push_target(src, 'target', dst_path, annex=annex)
    eq_(orig_branches, src_repo.get_branches())

    res = src.push(to="target")
    eq_(orig_branches, src_repo.get_branches())
    assert_result_count(res, 2 if annex else 1)
    assert_in_results(res,
                      action='publish',
                      status='ok',
                      target='target',
                      refspec=DEFAULT_REFSPEC,
                      operations=['new-branch'])

    assert_repo_status(src_repo, annex=annex)
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # configure a default merge/upstream target
    src.config.set('branch.{}.remote'.format(DEFAULT_BRANCH),
                   'target',
                   where='local')
    src.config.set('branch.{}.merge'.format(DEFAULT_BRANCH),
                   DEFAULT_BRANCH,
                   where='local')

    # don't fail when doing it again, no explicit target specification
    # needed anymore
    res = src.push()
    eq_(orig_branches, src_repo.get_branches())
    # and nothing is pushed
    assert_status('notneeded', res)

    assert_repo_status(src_repo, annex=annex)
    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # some modification:
    (src.pathobj / 'test_mod_file').write_text("Some additional stuff.")
    src.save(to_git=True, message="Modified.")
    (src.pathobj / 'test_mod_annex_file').write_text("Heavy stuff.")
    src.save(to_git=not annex, message="Modified again.")
    assert_repo_status(src_repo, annex=annex)

    # we could say since='HEAD~2' to make things fast, or we are lazy
    # and say since='^' to indicate the state of the tracking remote
    # which is the same, because we made two commits since the last push.
    res = src.push(to='target', since="^", jobs=2)
    assert_in_results(
        res,
        action='publish',
        status='ok',
        target='target',
        refspec=DEFAULT_REFSPEC,
        # we get to see what happened
        operations=['fast-forward'])
    if annex:
        # we got to see the copy result for the annexed files
        assert_in_results(res,
                          action='copy',
                          status='ok',
                          path=str(src.pathobj / 'test_mod_annex_file'))
        # we published, so we can drop and reobtain
        ok_(src_repo.file_has_content('test_mod_annex_file'))
        src_repo.drop('test_mod_annex_file')
        ok_(not src_repo.file_has_content('test_mod_annex_file'))
        src_repo.get('test_mod_annex_file')
        ok_(src_repo.file_has_content('test_mod_annex_file'))
        ok_file_has_content(src_repo.pathobj / 'test_mod_annex_file',
                            'Heavy stuff.')

    eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
        list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))
    if not (annex and src_repo.is_managed_branch()):
        # the following doesn't make sense in managed branches, because
        # a commit that could be amended is no longer the last commit
        # of a branch after a sync has happened (which did happen
        # during the last push above)

        # amend and change commit msg in order to test for force push:
        src_repo.commit("amended", options=['--amend'])
        # push should be rejected (non-fast-forward):
        res = src.push(to='target', since='HEAD~2', on_failure='ignore')
        # fails before even touching the annex branch
        assert_in_results(res,
                          action='publish',
                          status='error',
                          target='target',
                          refspec=DEFAULT_REFSPEC,
                          operations=['rejected', 'error'])
        # push with force=True works:
        res = src.push(to='target', since='HEAD~2', force='gitpush')
        assert_in_results(res,
                          action='publish',
                          status='ok',
                          target='target',
                          refspec=DEFAULT_REFSPEC,
                          operations=['forced-update'])
        eq_(list(target.get_branch_commits_(DEFAULT_BRANCH)),
            list(src_repo.get_branch_commits_(DEFAULT_BRANCH)))

    # we do not have more branches than we had in the beginning
    # in particular no 'synced/<default branch>'
    eq_(orig_branches, src_repo.get_branches())
Exemple #56
0
def test_save(path):

    ds = Dataset(path)

    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("something")

    ds.repo.add("new_file.tst", git=True)
    ok_(ds.repo.dirty)

    ds.save(message="add a new file")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("modify")

    ok_(ds.repo.dirty)
    ds.save(message="modified new_file.tst")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # save works without ds and files given in the PWD
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("rapunzel")
    with chpwd(path):
        save(message="love rapunzel")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # and also without `-a` when things are staged
    with open(op.join(path, "new_file.tst"), "w") as f:
        f.write("exotic")
    ds.repo.add("new_file.tst", git=True)
    with chpwd(path):
        save(message="love marsians")
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    files = ['one.txt', 'two.txt']
    for fn in files:
        with open(op.join(path, fn), "w") as f:
            f.write(fn)

    ds.save([op.join(path, f) for f in files])
    # superfluous call to save (all was saved already), should not fail
    # but report that nothing was saved
    assert_status('notneeded', ds.save(message="set of new files"))
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # create subdataset
    subds = ds.create('subds')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    # modify subds
    with open(op.join(subds.path, "some_file.tst"), "w") as f:
        f.write("something")
    subds.save()
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    # ensure modified subds is committed
    ds.save()
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))

    # now introduce a change downstairs
    subds.create('someotherds')
    assert_repo_status(subds.path, annex=isinstance(subds.repo, AnnexRepo))
    ok_(ds.repo.dirty)
    # and save via subdataset path
    ds.save('subds', version_tag='new_sub')
    assert_repo_status(path, annex=isinstance(ds.repo, AnnexRepo))
    tags = ds.repo.get_tags()
    ok_(len(tags) == 1)
    eq_(tags[0], dict(hexsha=ds.repo.get_hexsha(), name='new_sub'))
    # fails when retagged, like git does
    res = ds.save(version_tag='new_sub', on_failure='ignore')
    assert_status('error', res)
    assert_result_count(res,
                        1,
                        action='save',
                        type='dataset',
                        path=ds.path,
                        message=('cannot tag this version: %s',
                                 "fatal: tag 'new_sub' already exists"))
Exemple #57
0
def test_push_recursive(origin_path, src_path, dst_top, dst_sub,
                        dst_subnoannex, dst_subsub):
    # dataset with two submodules and one subsubmodule
    origin = Dataset(origin_path).create()
    origin_subm1 = origin.create('sub m')
    origin_subm1.create('subsub m')
    origin.create('subm noannex', annex=False)
    origin.save()
    assert_repo_status(origin.path)
    # prepare src as a fresh clone with all subdatasets checked out recursively
    # running on a clone should make the test scenario differ more from
    # test_push(), even for the pieces that should be identical
    top = Clone.__call__(source=origin.path, path=src_path)
    sub, subsub, subnoannex = top.get('.',
                                      recursive=True,
                                      get_data=False,
                                      result_xfm='datasets')

    target_top = mk_push_target(top, 'target', dst_top, annex=True)
    # subdatasets have no remote yet, so recursive publishing should fail:
    res = top.push(to="target", recursive=True, on_failure='ignore')
    assert_in_results(res,
                      path=top.path,
                      type='dataset',
                      refspec=DEFAULT_REFSPEC,
                      operations=['new-branch'],
                      action='publish',
                      status='ok',
                      target='target')
    for d in (sub, subsub, subnoannex):
        assert_in_results(res,
                          status='error',
                          type='dataset',
                          path=d.path,
                          message=("Unknown target sibling '%s'.", 'target'))
    # now fix that and set up targets for the submodules
    target_sub = mk_push_target(sub, 'target', dst_sub, annex=True)
    target_subnoannex = mk_push_target(subnoannex,
                                       'target',
                                       dst_subnoannex,
                                       annex=False)
    target_subsub = mk_push_target(subsub, 'target', dst_subsub, annex=True)

    # and same push call as above
    res = top.push(to="target", recursive=True)
    # topds skipped
    assert_in_results(res,
                      path=top.path,
                      type='dataset',
                      action='publish',
                      status='notneeded',
                      target='target')
    # the rest pushed
    for d in (sub, subsub, subnoannex):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec=DEFAULT_REFSPEC)
    # all corresponding branches match across all datasets
    for s, d in zip(
        (top, sub, subnoannex, subsub),
        (target_top, target_sub, target_subnoannex, target_subsub)):
        eq_(list(s.repo.get_branch_commits_(DEFAULT_BRANCH)),
            list(d.get_branch_commits_(DEFAULT_BRANCH)))
        if s != subnoannex:
            eq_(list(s.repo.get_branch_commits_("git-annex")),
                list(d.get_branch_commits_("git-annex")))

    # rerun should not result in further pushes of the default branch
    res = top.push(to="target", recursive=True)
    assert_not_in_results(res, status='ok', refspec=DEFAULT_REFSPEC)
    assert_in_results(res, status='notneeded', refspec=DEFAULT_REFSPEC)

    if top.repo.is_managed_branch():
        raise SkipTest(
            'Save/status of subdataset with managed branches is a still '
            'unresolved issue')

    # now annex a file in subsub
    test_copy_file = subsub.pathobj / 'test_mod_annex_file'
    test_copy_file.write_text("Heavy stuff.")
    # save all the way up
    assert_status(('ok', 'notneeded'),
                  top.save(message='subsub got something', recursive=True))
    assert_repo_status(top.path)
    # publish straight up, should be smart by default
    res = top.push(to="target", recursive=True)
    # we see 3 out of 4 datasets pushed (sub noannex was left unchanged)
    for d in (top, sub, subsub):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec=DEFAULT_REFSPEC)
    # file content copied too
    assert_in_results(res,
                      action='copy',
                      status='ok',
                      path=str(test_copy_file))
    # verify it is accessible, drop and bring back
    assert_status('ok', top.drop(str(test_copy_file)))
    ok_(not subsub.repo.file_has_content('test_mod_annex_file'))
    top.get(test_copy_file)
    ok_file_has_content(test_copy_file, 'Heavy stuff.')

    # make two modifications
    (sub.pathobj / 'test_mod_annex_file').write_text('annex')
    (subnoannex.pathobj / 'test_mod_file').write_text('git')
    # save separately
    top.save(sub.pathobj, message='annexadd', recursive=True)
    top.save(subnoannex.pathobj, message='gitadd', recursive=True)
    # now only publish the latter one
    res = top.push(to="target", since='HEAD~1', recursive=True)
    # nothing copied, no reports on the other modification
    assert_not_in_results(res, action='copy')
    assert_not_in_results(res, path=sub.path)
    for d in (top, subnoannex):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec=DEFAULT_REFSPEC)
    # an unconditional push should now pick up the remaining changes
    res = top.push(to="target", recursive=True)
    assert_in_results(res,
                      action='copy',
                      status='ok',
                      path=str(sub.pathobj / 'test_mod_annex_file'))
    assert_in_results(res,
                      status='ok',
                      type='dataset',
                      path=sub.path,
                      refspec=DEFAULT_REFSPEC)
    for d in (top, subnoannex, subsub):
        assert_in_results(res,
                          status='notneeded',
                          type='dataset',
                          path=d.path,
                          refspec=DEFAULT_REFSPEC)
Exemple #58
0
def test_bf1886(path):
    parent = Dataset(path).create()
    parent.create('sub')
    assert_repo_status(parent.path)
    # create a symlink pointing down to the subdataset, and add it
    os.symlink('sub', op.join(parent.path, 'down'))
    parent.save('down')
    assert_repo_status(parent.path)
    # now symlink pointing up
    os.makedirs(op.join(parent.path, 'subdir', 'subsubdir'))
    os.symlink(op.join(op.pardir, 'sub'), op.join(parent.path, 'subdir', 'up'))
    parent.save(op.join('subdir', 'up'))
    # 'all' to avoid the empty dir being listed
    assert_repo_status(parent.path, untracked_mode='all')
    # now symlink pointing 2xup, as in #1886
    os.symlink(op.join(op.pardir, op.pardir, 'sub'),
               op.join(parent.path, 'subdir', 'subsubdir', 'upup'))
    parent.save(op.join('subdir', 'subsubdir', 'upup'))
    assert_repo_status(parent.path)
    # simultaneously add a subds and a symlink pointing to it
    # create subds, but don't register it
    create(op.join(parent.path, 'sub2'))
    os.symlink(op.join(op.pardir, op.pardir, 'sub2'),
               op.join(parent.path, 'subdir', 'subsubdir', 'upup2'))
    parent.save(['sub2', op.join('subdir', 'subsubdir', 'upup2')])
    assert_repo_status(parent.path)
    # full replication of #1886: the above, but executed from within the
    # subdir holding the symlink, with no reference dataset
    create(op.join(parent.path, 'sub3'))
    os.symlink(op.join(op.pardir, op.pardir, 'sub3'),
               op.join(parent.path, 'subdir', 'subsubdir', 'upup3'))
    # need to use absolute paths
    with chpwd(op.join(parent.path, 'subdir', 'subsubdir')):
        save([
            op.join(parent.path, 'sub3'),
            op.join(parent.path, 'subdir', 'subsubdir', 'upup3')
        ])
    assert_repo_status(parent.path)
Exemple #59
0
class ArchiveAnnexCustomRemote(AnnexCustomRemote):
    """Special custom remote allowing to obtain files from archives

     Archives must be under annex'ed themselves.
    """
    CUSTOM_REMOTE_NAME = "archive"
    SUPPORTED_SCHEMES = (
        AnnexCustomRemote._get_custom_scheme(CUSTOM_REMOTE_NAME), )
    # Since we support only 1 scheme here
    URL_SCHEME = SUPPORTED_SCHEMES[0]
    URL_PREFIX = URL_SCHEME + ":"
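    # The effective scheme is "dl+archive" (cf. the dl+archive: URLs in the
    # get_file_url() docstring below), so URL_PREFIX becomes "dl+archive:".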

    AVAILABILITY = "local"
    COST = 500

    def __init__(self, annex, path=None, persistent_cache=True, **kwargs):
        super().__init__(annex)

        # MIH: figure out what the following is all about,
        # in particular path==None
        self.repo = Dataset(get_dataset_root(Path.cwd())).repo \
            if not path \
            else AnnexRepo(path, create=False, init=False)

        self.path = self.repo.path
        # annex requests a load by KEY, not by the URL it originally asked
        # about.  So for a key we might get back multiple URLs and, as a
        # heuristic, let's use the most recently asked one

        self._last_url = None  # for heuristic to choose among multiple URLs
        self._cache = ArchivesCache(self.path, persistent=persistent_cache)
        self._contentlocations = DictCache(size_limit=100)  # TODO: config ?

    def stop(self, *args):
        """Stop communication with annex"""
        self._cache.clean()

    def get_file_url(self,
                     archive_file=None,
                     archive_key=None,
                     file=None,
                     size=None):
        """Given archive (file or a key) and a file -- compose URL for access

        Examples
        --------

        dl+archive:SHA256E-s176--69...3e.tar.gz#path=1/d2/2d&size=123
            when size of file within archive was known to be 123
        dl+archive:SHA256E-s176--69...3e.tar.gz#path=1/d2/2d
            when size of file within archive was not provided

        Parameters
        ----------
        size: int, optional
          Size of the file.  If not provided, the size attribute is simply
          omitted from the URL
        """
        assert (file is not None)
        if archive_file is not None:
            if archive_key is not None:
                raise ValueError(
                    "Provide archive_file or archive_key - not both")
            archive_key = self.repo.get_file_annexinfo(archive_file)['key']
        assert (archive_key is not None)
        attrs = OrderedDict()  # leaving room for more attributes in the future
        if file:
            attrs['path'] = file.lstrip('/')
        if size is not None:
            attrs['size'] = size
        return str(
            URL(scheme=self.URL_SCHEME, path=archive_key, fragment=attrs))
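
    # Illustrative usage sketch (not part of the original code): composing a
    # URL from an already known archive key; key and path values are made up.
    #
    #   remote.get_file_url(
    #       archive_key='SHA256E-s176--69...3e.tar.gz',
    #       file='1/d2/2d', size=123)
    #   # -> 'dl+archive:SHA256E-s176--69...3e.tar.gz#path=1/d2/2d&size=123'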

    @property
    def cache(self):
        return self._cache

    def _parse_url(self, url):
        """Parse url and return archive key, file within archive and
        additional attributes (such as size)"""
        url = URL(url)
        assert (url.scheme == self.URL_SCHEME)
        fdict = url.fragment_dict
        if 'path' not in fdict:
            # must be old-style key/path#size=
            assert '/' in url.path, "must be of key/path format"
            key, path = url.path.split('/', 1)
        else:
            key, path = url.path, fdict.pop('path')
        if 'size' in fdict:
            fdict['size'] = int(fdict['size'])
        return key, path, fdict
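
    # Illustrative sketch (hypothetical values): the inverse operation on such
    # a URL.
    #
    #   remote._parse_url(
    #       'dl+archive:SHA256E-s176--69...3e.tar.gz#path=1/d2/2d&size=123')
    #   # -> ('SHA256E-s176--69...3e.tar.gz', '1/d2/2d', {'size': 123})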

    def _gen_akey_afiles(self, key, sorted=False, unique_akeys=True):
        """Given a key, yield akey, afile pairs

        if `sorted`, those which have an extracted version in the local
        cache will be yielded first

        The pairs are determined based on the URLs for datalad archives

        Made "generators all the way" as an exercise, but also to delay any
        checks etc. until really necessary.
        """
        # we will need all URLs later on anyway (ATM), so let's list() them.
        # Here we have a single scheme (archive) anyway, so there is not
        # much optimization possible
        urls = list(self.gen_URLS(key))

        akey_afiles = [
            self._parse_url(url)[:2]  # skip size
            for url in urls
        ]

        if unique_akeys:
            akey_afiles = unique(akey_afiles, key=itemgetter(0))

        if not sorted:
            for pair in akey_afiles:
                yield pair
            return

        # Otherwise we will go through each one

        # multiple URLs are available so we need to figure out which one
        # would be most efficient to "deal with"
        akey_afile_paths = (((akey, afile),
                             self.get_contentlocation(akey,
                                                      absolute=True,
                                                      verify_exists=False))
                            for akey, afile in akey_afiles)

        # by default get_contentlocation would return an empty result for a key
        # which is not available locally.  But we could still have an extracted
        # archive in the cache.  So we pretty much need to first get all
        # possible ones and only then drop those which aren't present locally.
        # That is why verify_exists was added
        yielded = set()
        akey_afile_paths_ = []

        # use the cache to check which archives might already be present
        # (extracted) there
        for akey_afile, akey_path in akey_afile_paths:
            if akey_path and self.cache[akey_path].is_extracted:
                yield akey_afile
                yielded.add(akey_afile)
            akey_afile_paths_.append((akey_afile, akey_path))

        # replace the generator with the already collected list.  The idea is
        # that in many cases we don't even need to create the full list and an
        # initial single yield would be enough, so we don't need to check
        # locations etc for every possible hit
        akey_afile_paths = akey_afile_paths_

        # if not present in the cache -- check which are present
        # locally and choose that one to use, so it would get extracted
        for akey_afile, akey_path in akey_afile_paths:
            if akey_path and op.exists(akey_path):
                yielded.add(akey_afile)
                yield akey_afile

        # So no archive is present either in the cache or originally under
        # annex.  XXX as some kind of heuristic, I guess we can use last_url ;-)
        if self._last_url and self._last_url in urls \
                and (len(urls) == len(akey_afiles)):
            akey_afile, _ = akey_afile_paths[urls.index(self._last_url)]
            yielded.add(akey_afile)
            yield akey_afile

        for akey_afile, _ in akey_afile_paths:
            if akey_afile not in yielded:
                yield akey_afile

    def get_contentlocation(self, key, absolute=False, verify_exists=True):
        """Return (relative to top or absolute) path to the file containing the key

        This is a wrapper around AnnexRepo.get_contentlocation which provides
        caching of the result (we are asking the location for the same archive
        key often)
        """
        if key not in self._contentlocations:
            fpath = self.repo.get_contentlocation(key, batch=True)
            if fpath:  # shouldn't store empty ones
                self._contentlocations[key] = fpath
        else:
            fpath = self._contentlocations[key]
            # but verify that it exists
            if verify_exists and not op.lexists(op.join(self.path, fpath)):
                # prune from cache
                del self._contentlocations[key]
                fpath = ''

        if absolute and fpath:
            return op.join(self.path, fpath)
        else:
            return fpath

    # Protocol implementation
    def checkurl(self, url):
        # TODO:  what about those MULTI and list to be returned?
        #  should we return all filenames or keys within archive?
        #  might be way too many?
        #  only if just archive portion of url is given or the one pointing
        #  to specific file?
        lgr.debug("Current directory: %s, url: %s", os.getcwd(), url)
        akey, afile, attrs = self._parse_url(url)
        size = attrs.get('size', None)

        # But reply that it is present only if the archive is present
        # TODO: this would throw an exception if not present, so this statement
        # is kinda bogus
        akey_path = self.get_contentlocation(akey, absolute=True)
        if akey_path:
            # Extract via cache only if size is not yet known
            if size is None:
                # if for testing we want to force getting the archive extracted
                efile = self.cache[akey_path].get_extracted_filename(afile)
                efile = ensure_bytes(efile)

                if op.exists(efile):
                    size = os.stat(efile).st_size

            # so it was a good successful one -- record
            self._last_url = url

            if size is None:
                return True
            else:
                # FIXME: providing a filename causes annex to not even ask us
                # upon drop :-/
                return [dict(size=size)]  # , basename(afile))

        else:
            # TODO: theoretically we should first check if key is available
            # from any remote to know if file is available
            return False

    def checkpresent(self, key):
        # TODO: we need to maintain a mapping from urls to keys.  Then
        # we could even store the filename within the archive.
        # Otherwise it is unrealistic to require recomputing the key, even if
        # we knew the backend etc
        # The same content could be available from multiple locations within
        # the same archive, so let's not ask it twice since here we don't care
        # about "afile"
        for akey, _ in self._gen_akey_afiles(key, unique_akeys=True):
            if self.get_contentlocation(akey) \
                    or self.repo.is_available(akey, batch=True, key=True):
                return True
        # it is unclear to MIH why this must be UNKNOWN rather than FALSE
        # but this is how I found it
        raise RemoteError('Key not present')

    def remove(self, key):
        raise UnsupportedRequest('This special remote cannot remove content')
        # # TODO: proxy query to the underlying tarball under annex that if
        # # tarball was removed (not available at all) -- report success,
        # # otherwise failure (currently the only one)
        # akey, afile = self._get_akey_afile(key)
        # if False:
        #     # TODO: proxy, checking present of local tarball is not
        #     # sufficient
        #     #  not exists(self.get_key_path(key)):
        #     self.send("REMOVE-SUCCESS", akey)
        # else:
        #     self.send("REMOVE-FAILURE", akey,
        #               "Removal from file archives is not supported")

    def whereis(self, key):
        return False
        # although it would be more logical to report back success, that leads,
        # IMHO, to more confusing duplication. See
        # http://git-annex.branchable.com/design/external_special_remote_protocol/#comment-3f9588f6a972ae566347b6f467b53b54
        # try:
        #     key, file = self._get_akey_afile(key)
        #     self.send("WHEREIS-SUCCESS", "file %s within archive %s" % (file, key))
        # except ValueError:
        #     self.send("WHEREIS-FAILURE")

    def transfer_retrieve(self, key, file):
        akeys_tried = []
        # the same content could come from multiple files within the same
        # archive.  So far it doesn't make sense to "try all" of them since if
        # one fails it means the others would fail too, so it makes sense to
        # immediately prune the list so we keep only the ones from unique akeys.
        # Maybe, whenever we support extraction directly from the tarballs,
        # we should go through all and choose the one easiest to get, or similar.
        for akey, afile in self._gen_akey_afiles(key,
                                                 sorted=True,
                                                 unique_akeys=True):
            if not akey:
                lgr.warning("Got an empty archive key %r for key %s. Skipping",
                            akey, key)
                continue
            akeys_tried.append(akey)
            try:
                with lock_if_check_fails(
                        check=(self.get_contentlocation, (akey, )),
                        lock_path=(lambda k: op.join(
                            self.repo.path, '.git', 'datalad-archives-%s' % k),
                                   (akey, )),
                        operation="annex-get") as (akey_fpath, lock):
                    if lock:
                        assert not akey_fpath
                        self._annex_get_archive_by_key(akey)
                        akey_fpath = self.get_contentlocation(akey)

                if not akey_fpath:
                    raise RuntimeError(
                        "Fetch was reported to succeed, but now we can't "
                        "get its location.  Check the logic")

                akey_path = op.join(self.repo.path, akey_fpath)
                assert op.exists(akey_path), \
                       "Key file %s is not present" % akey_path

                # Extract that bloody file from the bloody archive
                # TODO: implement/use caching, for now a simple one
                #  actually patool doesn't support extraction of a single file
                #  https://github.com/wummel/patool/issues/20
                # so
                pwd = getpwd()
                lgr.debug("Getting file {afile} from {akey_path} "
                          "while PWD={pwd}".format(**locals()))
                was_extracted = self.cache[akey_path].is_extracted
                apath = self.cache[akey_path].get_extracted_file(afile)
                link_file_load(apath, file)
                if not was_extracted and self.cache[akey_path].is_extracted:
                    self.message(
                        "%s special remote is using an extraction cache "
                        "under %s. Remove it with DataLad's 'clean' "
                        "command to save disk space." %
                        (ARCHIVES_SPECIAL_REMOTE, self.cache[akey_path].path),
                        type='info',
                    )
                return
            except Exception as exc:
                ce = CapturedException(exc)
                self.message(
                    "Failed to fetch {akey} containing {key}: {msg}".format(
                        akey=akey,
                        key=key,
                        # we need to get rid of any newlines, or we might
                        # break the special remote protocol
                        msg=str(ce).replace('\n', '|')))
                continue

        raise RemoteError("Failed to fetch any archive containing {key}. "
                          "Tried: {akeys_tried}".format(**locals()))

    def claimurl(self, url):
        scheme = urlparse(url).scheme
        if scheme in self.SUPPORTED_SCHEMES:
            return True
        else:
            return False

    def _annex_get_archive_by_key(self, akey):
        # TODO: make it more stringent?
        # The command could have failed to run if the key was not present
        # locally yet, thus retrieve the key using annex
        # TODO: we need to somehow report to the user that this is happening,
        # and report progress on the download
        from humanize import naturalsize

        from datalad.support.annexrepo import AnnexJsonProtocol

        akey_size = self.repo.get_size_from_key(akey)
        self.message(
            "To obtain some keys we need to fetch an archive "
            "of size %s" %
            (naturalsize(akey_size) if akey_size else "unknown"),
            type='info',
        )

        try:
            self.repo._call_annex(
                ["get", "--json", "--json-progress", "--key", akey],
                protocol=AnnexJsonProtocol,
            )
        except Exception:
            self.message(f'Failed to fetch archive with key {akey}')
            raise
Exemple #60
0
def _get_procedure_implementation(name='*', ds=None):
    """get potential procedure path and configuration

    Order of consideration is user-level, system-level, dataset,
    datalad extensions, datalad.  The first one found according to this order
    is the one to be returned.  Therefore local definitions/configurations take
    precedence over ones that come from outside (via a datalad extension or a
    dataset with its .datalad/config).  If a dataset had precedence (as it was
    before), the addition (or just an update) of a (sub-)dataset could
    surprisingly cause you to execute code different from what you defined
    within ~/.gitconfig or your local repository's .git/config.
    So, local definitions take precedence over remote ones, and more specific
    ones over more general ones.

    Returns
    -------
    tuple
      path, name, format string, help message
    """

    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (
                    m,
                    n,
                ) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(
            ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (
                    m,
                    n,
                ) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, n, f, h in _get_procedure_implementation(name=name,
                                                            ds=subds):
                yield m, n, f, h

    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(entry_point.module_name,
                                      'resources/procedures'), name):
                yield (
                    m,
                    n,
                ) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'), name):
        yield (
            m,
            n,
        ) + _get_proc_config(n)