Example no. 1
def test_creatsubdatasets(topds_path, n=2):
    from datalad.distribution.dataset import Dataset
    from datalad.api import create
    ds = Dataset(topds_path).create()
    paths = [op.join(topds_path, "subds%d" % i) for i in range(n)]
    paths.extend(
        op.join(topds_path, "subds%d" % i, "subsub%d" % k) for i in range(n)
        for k in range(2))
    # To allow for parallel execution without hitting the problem of
    # a lock in the super dataset, we create all subdatasets, and then
    # save them all within their superdataset
    create_ = partial(
        create,  # cfg_proc="yoda",
        result_xfm=None,
        return_type='generator')
    # if we flip the paths to go from the end, create without --force should fail
    # and we should get the exception (the first one encountered!)
    # Note: reraise_immediately is of "concern" only for the producer, since we typically
    # rely on outside code to do the killing!
    assert_raises(IncompleteResultsError, list,
                  ProducerConsumer(paths[::-1], create_, jobs=5))
    # we are in a dirty state, let's just remove all those for a clean run
    rmtree(topds_path)

    # and this one followed by save should be good IFF we provide our dependency checker
    ds = Dataset(topds_path).create()
    list(
        ProducerConsumer(paths,
                         create_,
                         safe_to_consume=no_parentds_in_futures,
                         jobs=5))
    ds.save(paths)
    assert_repo_status(ds.repo)
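
For readers unfamiliar with the pattern in this example, the sketch below restates the idea ("create the children in parallel, then register them in the parent in one serialized pass") using only the standard library; it is an illustration, not DataLad's ProducerConsumer API, and the helper names are made up.

# Illustration only (standard library, not DataLad's ProducerConsumer):
# create child directories in parallel, then do the single serialized
# "save them all within their superdataset" step at the end.
import os
import tempfile
from concurrent.futures import ThreadPoolExecutor


def _create_child(parent, name):
    # hypothetical stand-in for datalad.api.create()
    child = os.path.join(parent, name)
    os.makedirs(child, exist_ok=True)
    return child


def demo_parallel_create(n=2, jobs=5):
    parent = tempfile.mkdtemp()
    names = ["subds%d" % i for i in range(n)]
    with ThreadPoolExecutor(max_workers=jobs) as pool:
        created = list(pool.map(lambda name: _create_child(parent, name), names))
    # serialized equivalent of ds.save(paths) in the test above
    with open(os.path.join(parent, "MANIFEST"), "w") as f:
        f.write("\n".join(sorted(created)) + "\n")
    return parent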
Example no. 2
def check_compress_file(ext, annex, path, name):
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive,
                   path=path)
    assert_true(op.exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
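
The comment at the top of this example notes that some compression formats cannot store a filename, so the decompressed name has to be derived from the archive name. A minimal standard-library illustration of that point with .xz (lzma), independent of DataLad's compress_files/decompress_file:

# Illustration only: a raw .xz stream stores no member filename, so the
# output name can only be derived from the archive name itself.
import lzma
import os
import shutil
import tempfile

workdir = tempfile.mkdtemp()
src = os.path.join(workdir, "file.txt")
with open(src, "w") as f:
    f.write("content")

archive = src + ".xz"
with open(src, "rb") as fin, lzma.open(archive, "wb") as fout:
    shutil.copyfileobj(fin, fout)

restored = archive[:-len(".xz")] + ".restored"  # name comes from the archive name
with lzma.open(archive, "rb") as fin, open(restored, "wb") as fout:
    shutil.copyfileobj(fin, fout)
assert open(restored).read() == "content"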
Example no. 3
def test_ExtractedArchive(path):
    archive = op.join(path, fn_archive_obscure_ext)
    earchive = ExtractedArchive(archive)
    assert_false(op.exists(earchive.path))
    # no longer the case -- just using hash for now
    # assert_in(os.path.basename(archive), earchive.path)

    fpath = op.join(fn_archive_obscure,  # lead directory
                    fn_in_archive_obscure)
    extracted = earchive.get_extracted_filename(fpath)
    eq_(extracted, op.join(earchive.path, fpath))
    assert_false(op.exists(extracted))  # not yet

    extracted_ = earchive.get_extracted_file(fpath)
    eq_(extracted, extracted_)
    assert_true(op.exists(extracted))  # now it should

    extracted_files = earchive.get_extracted_files()
    ok_generator(extracted_files)
    eq_(sorted(extracted_files),
        sorted([
            # ['bbc/3.txt', 'bbc/abc']
            op.join(fn_archive_obscure, fn_in_archive_obscure),
            op.join(fn_archive_obscure, '3.txt')
        ]))

    earchive.clean()
    if not os.environ.get('DATALAD_TESTS_TEMP_KEEP'):
        assert_false(op.exists(earchive.path))
Example no. 4
def test_relpath_semantics(path):
    with chpwd(path):
        super = create('super')
        create('subsrc')
        sub = install(
            dataset='super', source='subsrc', path=op.join('super', 'sub'))
        eq_(sub.path, op.join(super.path, 'sub'))
Example no. 5
def check_compress_dir(ext, path, name):
    archive = name + ext
    compress_files([os.path.basename(path)], archive,
                   path=os.path.dirname(path))
    assert_true(op.exists(archive))
    name_extracted = name + "_extracted"
    decompress_file(archive, name_extracted, leading_directories='strip')
    assert_true(op.exists(op.join(name_extracted, 'empty')))
    assert_true(op.exists(op.join(name_extracted, 'd1', 'd2', 'f1')))
Example no. 6
File: wtf.py Project: hanke/datalad
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import os
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl-len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
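
The comment inside get_max_path_length suggests that binary search could replace the linear probing; below is a hedged, self-contained sketch of that variant (not part of DataLad, and assuming the "fits" predicate is monotone in the path length).

# Sketch of the binary-search variant hinted at in the comment above.
import os
import random
import tempfile


def get_max_path_length_bisect(top_path=None, maxl=1000):
    if not top_path:
        top_path = tempfile.gettempdir()
    prefix = os.path.join(top_path, "dl%d" % random.randint(1, 100000))

    def fits(length):
        # can a file whose full path is `length` characters long be created?
        filename = prefix + "_" * (length - len(prefix))
        try:
            with open(filename, "w"):
                pass
        except OSError:
            return False
        os.unlink(filename)
        return True

    lo, hi = len(prefix), maxl
    if not fits(lo):
        return None  # even the prefix alone is too long
    while lo < hi:
        mid = (lo + hi + 1) // 2  # bias up to find the largest length that fits
        if fits(mid):
            lo = mid
        else:
            hi = mid - 1
    return lo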
def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally

    TODO: RF so could be used in other places as well
    """
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args,
                         **kwargs)
Example no. 8
def check_crawl_autoaddtext(gz, ind, topurl, outd):
    ds = create(outd)
    ds.run_procedure("cfg_text2git")
    with chpwd(outd):  # TODO -- dataset argument
        template_kwargs = {
            'url': topurl,
            'a_href_match_': '.*',
        }
        if gz:
            template_kwargs['archives_re'] = r"\.gz$"
        crawl_init(template_kwargs, save=True, template='simple_with_archives')
        try:
            crawl()
        except MissingExternalDependency as exc:
            raise SkipTest(exc_str(exc))
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)

    if 'compressed.dat.gz' in TEST_TREE2:
        if gz:
            ok_file_under_git(outd, "compressed.dat", annexed=False)
            ok_file_has_content(op.join(outd, "compressed.dat"),
                                u"мама мыла раму")
        else:
            ok_file_under_git(outd, "compressed.dat.gz", annexed=True)
    else:
        raise SkipTest(
            "Need datalad >= 0.11.2 to test .gz files decompression")
Example no. 9
def test_our_metadataset_search(tdir):
    # smoke test for basic search operations on our super-megadataset
    # expensive operation but ok
    ds = install(path=tdir,
                 source=DATASETS_TOPURL,
                 result_xfm='datasets',
                 return_type='item-or-list')
    res_haxby = list(ds.search('haxby'))
    assert_greater(len(res_haxby), 10)
    # default search should be case insensitive
    # but somehow it is not fully -- we get 12 here
    #res_Haxby = list(ds.search('Haxby'))
    #eq_(len(res_haxby), len(res_Haxby))

    assert_result_count(ds.search('id:873a6eae-7ae6-11e6-a6c8-002590f97d84',
                                  mode='textblob'),
                        1,
                        type='dataset',
                        path=op.join(ds.path, 'crcns', 'pfc-2'))

    # there is a problem with argparse not decoding into utf8 in PY2
    from datalad.cmdline.tests.test_main import run_main
    # TODO: make it into an independent lean test
    from datalad.cmd import Runner
    out, err = Runner(cwd=ds.path)('datalad search Buzsáki')
    assert_in('crcns/pfc-2 ', out)  # has it in description
    # and then another aspect: this entry is among multiple authors; need to
    # check if aggregating them into a searchable entity was done correctly
    assert_in('crcns/hc-1 ', out)
Example no. 10
    def _flyweight_postproc_path(cls, path):
        # we want an absolute path, but no resolved symlinks
        if not op.isabs(path):
            path = op.join(op.getpwd(), path)

        # use canonical paths only:
        return op.normpath(path)
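
A brief standard-library aside on the distinction this method relies on: normpath canonicalizes purely lexically, while realpath would also resolve symlinks (the path below is made up for illustration).

# normpath vs realpath: only the latter resolves symlinks.
import os
import os.path as op

p = op.join(os.getcwd(), "some", "dir", "..", "repo")
print(op.normpath(p))   # lexical: '..' folded away, symlinks left alone
print(op.realpath(p))   # additionally resolves symlinks along the way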
Example no. 11
File: wtf.py Project: ypid/datalad
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
Example no. 12
def test_ok_file_under_git_symlinks(path):
    # Test that it works correctly under a symlinked path
    orepo = GitRepo(path)
    orepo.add('ingit')
    orepo.commit('msg')
    orepo.add('staged')
    lpath = path + "-symlink"  # will also be removed AFAIK by our tempfile handling
    os.symlink(path, lpath)
    ok_symlink(lpath)
    ok_file_under_git(op.join(path, 'ingit'))
    ok_file_under_git(op.join(lpath, 'ingit'))
    ok_file_under_git(op.join(lpath, 'staged'))
    with assert_raises(AssertionError):
        ok_file_under_git(op.join(lpath, 'notingit'))
    with assert_raises(AssertionError):
        ok_file_under_git(op.join(lpath, 'nonexisting'))
def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally and skip if there is no vcr

    TODO: RF local aspect so could be used in other places as well
    """
    kwargs.setdefault('skip_if_no_vcr', True)
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args, **kwargs)
Example no. 14
def test__version__():
    # in released stage, version in the last CHANGELOG entry
    # should correspond to the one in datalad
    CHANGELOG_filename = op.join(
        op.dirname(__file__), op.pardir, op.pardir, 'CHANGELOG.md')
    if not op.exists(CHANGELOG_filename):
        raise SkipTest("no %s found" % CHANGELOG_filename)
    regex = re.compile(r'^## '
                       r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                       r'\((?P<date>.*)\)'
                       r'\s+--\s+'
                       r'(?P<codename>.+)'
                       )
    with open(CHANGELOG_filename, 'rb') as f:
        for line in f:
            line = line.rstrip()
            if not line.startswith(b'## '):
                # The first section header we hit, must be our changelog entry
                continue
            reg = regex.match(assure_unicode(line))
            if not reg:  # first one at that level is the one
                raise AssertionError(
                    "Following line must have matched our regex: %r" % line)
            regd = reg.groupdict()
            changelog_version = regd['version']
            lv_changelog_version = LooseVersion(changelog_version)
            # we might have a suffix - sanitize
            san__version__ = __version__.rstrip('.devdirty')
            lv__version__ = LooseVersion(san__version__)
            if '???' in regd['date'] and 'will be better than ever' in regd['codename']:
                # we only have our template
                # we can only assert that its version should be higher than
                # the one we have now
                assert_greater(lv_changelog_version, lv__version__)
            else:
                # should be a "release" record
                assert_not_in('???', regd['date'])
                assert_not_in('will be better than ever', regd['codename'])
                assert_equal(__hardcoded_version__, changelog_version)
                if __hardcoded_version__ != san__version__:
                    # It was not tagged yet and Changelog should have its
                    # template record for the next release
                    assert_greater(lv_changelog_version, lv__version__)
                    assert_in('.dev', san__version__)
                else:
                    # all is good, tagged etc
                    assert_equal(lv_changelog_version, lv__version__)
                    assert_equal(changelog_version, san__version__)
                    assert_equal(__hardcoded_version__, san__version__)
            return

    raise AssertionError(
        "No log line matching our regex found in %s" % CHANGELOG_filename
    )
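
As a stand-alone reference, here is the same header regex exercised against an invented CHANGELOG line (the version, date, and codename are made up for illustration):

# Stand-alone check of the CHANGELOG header pattern used in the test above.
import re

regex = re.compile(r'^## '
                   r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                   r'\((?P<date>.*)\)'
                   r'\s+--\s+'
                   r'(?P<codename>.+)')

sample = "## 0.12.2 (January 01, 2020) -- a made-up codename"
match = regex.match(sample)
assert match is not None
assert match.group('version') == '0.12.2'
print(match.groupdict())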
Example no. 15
def check_decompress_file(leading_directories, path):
    outdir = op.join(path, 'simple-extracted')

    with swallow_outputs() as cmo:
        decompress_file(op.join(path, fn_archive_obscure_ext),
                        outdir,
                        leading_directories=leading_directories)
        eq_(cmo.out, "")
        eq_(cmo.err, "")

    path_archive_obscure = op.join(outdir, fn_archive_obscure)
    if leading_directories == 'strip':
        assert_false(op.exists(path_archive_obscure))
        testpath = outdir
    elif leading_directories is None:
        assert_true(op.exists(path_archive_obscure))
        testpath = path_archive_obscure
    else:
        raise NotImplementedError("Dunno about this strategy: %s" %
                                  leading_directories)

    assert_true(op.exists(op.join(testpath, '3.txt')))
    assert_true(op.exists(op.join(testpath, fn_in_archive_obscure)))
    with open(op.join(testpath, '3.txt')) as f:
        eq_(f.read(), '3 load')
Example no. 16
def test_ArchivesCache():
    # we don't actually need to test archives handling itself
    path1 = "/zuba/duba"
    path2 = "/zuba/duba2"
    # should not be able to create a persistent cache without topdir
    assert_raises(ValueError, ArchivesCache, persistent=True)
    cache = ArchivesCache()  # by default -- non persistent

    archive1_path = op.join(path1, fn_archive_obscure_ext)
    archive2_path = op.join(path2, fn_archive_obscure_ext)
    cached_archive1_path = cache[archive1_path].path
    assert_false(cache[archive1_path].path == cache[archive2_path].path)
    assert_true(cache[archive1_path] is cache[archive1_path])
    cache.clean()
    assert_false(op.exists(cached_archive1_path))
    assert_false(op.exists(cache.path))

    # test del
    cache = ArchivesCache()  # by default -- non persistent
    assert_true(op.exists(cache.path))
    cache_path = cache.path
    del cache
    assert_false(op.exists(cache_path))
Example no. 17
def check_datasets_datalad_org(suffix, tdir):
    # Test that git-annex / datalad install and get work correctly on our datasets.datalad.org
    # Apparently things can break, especially with introduction of the
    # smart HTTP backend for apache2 etc
    ds = install(tdir, source='///dicoms/dartmouth-phantoms/bids_test6-PD+T2w' + suffix)
    eq_(ds.config.get('remote.origin.annex-ignore', None), None)
    # assert_result_count and not just assert_status since for some reason on
    # Windows we get two records due to a duplicate attempt (as res[1]) to get it
    # again, which is reported as "notneeded".  For the purpose of this test
    # it doesn't make a difference.
    assert_result_count(
        ds.get(op.join('001-anat-scout_ses-{date}', '000001.dcm')),
        1,
        status='ok')
    assert_status('ok', ds.remove())
Example no. 18
def check_datasets_datalad_org(suffix, tdir):
    # Test that git-annex / datalad install and get work correctly on our datasets.datalad.org
    # Apparently things can break, especially with introduction of the
    # smart HTTP backend for apache2 etc
    ds = install(tdir, source='///dicoms/dartmouth-phantoms/bids_test6-PD+T2w' + suffix)
    eq_(ds.config.get('remote.origin.annex-ignore', None), None)
    # assert_result_count and not just assert_status since for some reason on
    # Windows we get two records due to a duplicate attempt (as res[1]) to get it
    # again, which is reported as "notneeded".  For the purpose of this test
    # it doesn't make a difference.
    # git-annex version is not "real" - but that is about when fix was introduced
    from datalad import cfg
    if on_windows \
        and cfg.obtain("datalad.repo.version") < 6 \
        and external_versions['cmd:annex'] <= '7.20181203':
        raise SkipTest("Known to fail, needs fixed git-annex")
    assert_result_count(
        ds.get(op.join('001-anat-scout_ses-{date}', '000001.dcm')),
        1,
        status='ok')
    assert_status('ok', ds.remove())
Example no. 19
def check_datasets_datalad_org(suffix, tdir):
    # Test that git-annex / datalad install and get work correctly on our datasets.datalad.org
    # Apparently things can break, especially with introduction of the
    # smart HTTP backend for apache2 etc
    ds = install(tdir, source='///dicoms/dartmouth-phantoms/bids_test6-PD+T2w' + suffix)
    eq_(ds.config.get('remote.origin.annex-ignore', None), None)
    # assert_result_count and not just assert_status since for some reason on
    # Windows we get two records due to a duplicate attempt (as res[1]) to get it
    # again, which is reported as "notneeded".  For the purpose of this test
    # it doesn't make a difference.
    # git-annex version is not "real" - but that is about when fix was introduced
    from datalad import cfg
    if on_windows \
        and cfg.obtain("datalad.repo.version") < 6 \
        and external_versions['cmd:annex'] <= '7.20181203':
        raise SkipTest("Known to fail, needs fixed git-annex")
    assert_result_count(
        ds.get(op.join('001-anat-scout_ses-{date}', '000001.dcm')),
        1,
        status='ok')
    assert_status('ok', ds.remove())
Example no. 20
def path_under_rev_dataset(ds, path):
    ds_path = ds.pathobj
    try:
        rpath = str(ut.Path(path).relative_to(ds_path))
        if not rpath.startswith(op.pardir):
            # path is already underneath the dataset
            return path
    except Exception:
        # whatever went wrong, we gotta play safe
        pass

    root = get_dataset_root(str(path))
    while root is not None and not ds_path.samefile(root):
        # path and therefore root could be relative paths,
        # hence in the next round we cannot use dirname()
        # to jump into the next directory up, but we have
        # to use ./.. and get_dataset_root() will handle
        # the rest just fine
        root = get_dataset_root(op.join(root, op.pardir))
    if root is None:
        return None
    return ds_path / op.relpath(str(path), root)
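
The first branch of path_under_rev_dataset hinges on pathlib's relative_to, which is purely lexical and raises when the path is not underneath the base; a tiny illustration with invented paths:

# Path.relative_to is lexical: it never consults the filesystem and raises
# ValueError when the path is not textually underneath the base.
from pathlib import PurePosixPath

base = PurePosixPath("/data/ds")
print(PurePosixPath("/data/ds/sub/file").relative_to(base))  # sub/file
try:
    PurePosixPath("/data/elsewhere/file").relative_to(base)
except ValueError as exc:
    print("not under the dataset:", exc)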
Example no. 21
def test_no_blows(cookiesdir):
    cookies = CookiesDB(op.join(cookiesdir, 'mycookies'))
    # set the cookie
    cookies['best'] = 'mine'
    assert_equal(cookies['best'], 'mine')
    """
    Somehow this manages to trigger on conda but not on debian for me
    File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/shelve.py", line 125, in __setitem__
        self.dict[key.encode(self.keyencoding)] = f.getvalue()
    File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 216, in __setitem__
        self._index[key] = self._setval(pos, val)
    File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 178, in _setval
        with _io.open(self._datfile, 'rb+') as f:
        FileNotFoundError: [Errno 2] No such file or directory: '/home/yoh/.tmp/datalad_temp_test_no_blowsalnsw_wk/mycookies.dat'

    on Debian (python 3.7.3~rc1-1) I just get a warning: BDB3028 /home/yoh/.tmp/datalad_temp_test_no_blows58tdg67s/mycookies.db: unable to flush: No such file or directory
    """
    try:
        rmtree(cookiesdir)
    except OSError:
        # on NFS directory might still be open, so .nfs* lock file would prevent
        # removal, but it shouldn't matter and .close should succeed
        pass
    cookies.close()
Example no. 22
def test_get_leading_directory():
    ea = ExtractedArchive('/some/bogus', '/some/bogus')
    yield _test_get_leading_directory, ea, [], None
    yield _test_get_leading_directory, ea, ['file.txt'], None
    yield _test_get_leading_directory, ea, ['file.txt', op.join('d', 'f')], None
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('d', 'f2')], 'd'
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('d', 'f2')], 'd', {'consider': 'd'}
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('d', 'f2')], None, {'consider': 'dd'}
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('d2', 'f2')], None
    yield _test_get_leading_directory, ea, [op.join('d', 'd2', 'f'), op.join('d', 'd2', 'f2')], op.join('d', 'd2')
    yield _test_get_leading_directory, ea, [op.join('d', 'd2', 'f'), op.join('d', 'd2', 'f2')], 'd', {'depth': 1}
    # with some parasitic files
    yield _test_get_leading_directory, ea, [op.join('d', 'f'), op.join('._d')], 'd', {'exclude': [r'\._.*']}
    yield _test_get_leading_directory, ea, [op.join('d', 'd1', 'f'), op.join('d', '._d'), '._x'], op.join('d', 'd1'), {'exclude': [r'\._.*']}
Example no. 23
def test_direct_cfg(path1, path2):
    # and if repo already exists and we have env var - we fail too
    # Adding backend so we get some commit into the repo
    ar = AnnexRepo(path1, create=True, backend='MD5E')
    del ar
    AnnexRepo._unique_instances.clear()  # fight flyweight
    for path in (path1, path2):
        with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
            # try to create an annex repo in direct mode and see how it fails
            with assert_raises(DirectModeNoLongerSupportedError) as cme:
                AnnexRepo(path, create=True)
            assert_in("no longer supported by DataLad",
                      str(cme.exception))  # we have generic part
            assert_in("datalad.repo.direct configuration",
                      str(cme.exception))  # situation specific part
    # assert not op.exists(path2)   # that we didn't create it - we do!
    #   fixing for that would be too cumbersome since we first call GitRepo.__init__
    #   with create
    ar = AnnexRepo(path1)
    # check if we somehow didn't reset the flag
    assert not ar.is_direct_mode()

    if not ar.check_direct_mode_support():
        raise SkipTest(
            "Rest of test requires direct mode support in git-annex")

    # TODO: Remove the rest of this test once GIT_ANNEX_MIN_VERSION is
    # at least 7.20190912 (which dropped direct mode support).

    if ar.config.obtain("datalad.repo.version") >= 6:
        raise SkipTest(
            "Created repo not v5, cannot test detection of direct mode repos")
    # and if repo existed before and was in direct mode, we fail too
    # Since direct= option was deprecated entirely, we use protected method now
    ar._set_direct_mode(True)
    assert ar.is_direct_mode()
    del ar  # but we would need to somehow disable the flyweight
    with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
        with assert_raises(DirectModeNoLongerSupportedError) as cme:
            AnnexRepo(path1, create=False)

    # TODO: RM DIRECT decide what we should do here -- should we test/blow?
    #   ATM both tests below just pass
    ar2 = AnnexRepo(path2, create=True)
    # happily can do it since it doesn't need a worktree to do the clone
    ar2.add_submodule('sub1', url=path1)
    ar2sub1 = AnnexRepo(op.join(path2, 'sub1'))
    # but now let's convert that sub1 to direct mode
    assert not ar2sub1.is_direct_mode()
    ar2sub1._set_direct_mode(True)
    assert ar2sub1.is_direct_mode()
    del ar2
    del ar2sub1
    AnnexRepo._unique_instances.clear()  # fight flyweight

    ar2 = AnnexRepo(path2)
    list(ar2.get_submodules_())

    # And what if we are trying to add pre-cloned repo in direct mode?
    ar2sub2 = AnnexRepo.clone(path1, op.join(path2, 'sub2'))
    ar2sub2._set_direct_mode(True)
    del ar2sub2
    AnnexRepo._unique_instances.clear()  # fight flyweight
    ar2.add('sub2')
Example no. 24
    '.bidsignore',
    'code/**',
    '*.tsv',
    '*.json',
    '*.txt',
]
# just to be sure + _scans.tsv could contain dates
force_in_annex = [
    '*.nii.gz',
    '*.tgz',
    '*_scans.tsv',
]
# make an attempt to discover the prospective change in .gitattributes
# to decide what needs to be done, and make this procedure idempotent
# (for simple cases)
attr_fpath = op.join(ds.path, '.gitattributes')
if op.lexists(attr_fpath):
    with open(attr_fpath, 'rb') as f:
        attrs = f.read().decode()
else:
    attrs = ''

for paths, largefile in [
    (force_in_annex, 'anything'),
    (force_in_git, 'nothing'),
]:
    # amend gitattributes, if needed
    ds.repo.set_gitattributes([
        (path, {
            'annex.largefiles': largefile
        }) for path in paths
    def __call__(dataset,
                 filename=None,
                 archivetype='tar',
                 compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from unittest.mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo

        import logging
        lgr = logging.getLogger('datalad.local.export_archive')

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti

        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype, '{}{}'.format('.' if compression else '', compression)
            if archivetype == 'tar' else '')

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            filename = op.join(filename,
                               default_filename)  # under given directory
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(repo_files,
                                              allow_quick=True,
                                              batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(repo_files,
                                                    allow_quick=True,
                                                    batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning
                             if missing_content == 'continue' else lgr.debug)(
                                 'File %s has no content available, skipped',
                                 fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' %
                                          fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(
                                opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(fpath,
                           arcname=aname,
                           **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(status='ok',
                   path=filename,
                   type='file',
                   action='export_archive',
                   logger=lgr)
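
To make the file_extension expression above easier to follow, the same expression is evaluated in isolation below for a few (archivetype, compression) combinations; the values mirror the snippet's defaults.

# The extension logic from the snippet above, evaluated stand-alone:
#   ('tar', 'gz') -> '.tar.gz'
#   ('tar', '')   -> '.tar'
#   ('zip', 'gz') -> '.zip'  (zip compression is chosen via the zipfile mode, not the name)
for archivetype, compression in [('tar', 'gz'), ('tar', ''), ('zip', 'gz')]:
    file_extension = '.{}{}'.format(
        archivetype,
        '{}{}'.format('.' if compression else '', compression)
        if archivetype == 'tar' else '')
    print((archivetype, compression), '->', file_extension)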
Example no. 26
)
from datalad.support.exceptions import (
    MissingExternalDependency,
)
try:
    import github as gh
except ImportError:
    # make sure that the command complains too
    assert_raises(MissingExternalDependency, create_sibling_github, 'some')
    raise SkipTest


# Keep fixtures local to this test file
from datalad.support import path as op

FIXTURES_PATH = op.join(op.dirname(__file__), 'vcr_cassettes')


def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally

    TODO: RF so could be used in other places as well
    """
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args, **kwargs)


@with_tempfile
def test_invalid_call(path):
    # no dataset
    assert_raises(ValueError, create_sibling_github, 'bogus', dataset=path)
    ds = Dataset(path).create()
)
from datalad.support.exceptions import (
    MissingExternalDependency,
)
try:
    import github as gh
except ImportError:
    # make sure that the command complains too
    assert_raises(MissingExternalDependency, create_sibling_github, 'some')
    raise SkipTest


# Keep fixtures local to this test file
from datalad.support import path as op

FIXTURES_PATH = op.join(op.dirname(__file__), 'vcr_cassettes')


def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally and skip if there is no vcr

    TODO: RF local aspect so could be used in other places as well
    """
    kwargs.setdefault('skip_if_no_vcr', True)
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args, **kwargs)


@with_tempfile
def test_invalid_call(path):
    # no dataset
    assert_raises(ValueError, create_sibling_github, 'bogus', dataset=path)
Example no. 28
def use_cassette(name, *args, **kwargs):
    """Adapter to store fixtures locally

    TODO: RF so could be used in other places as well
    """
    return use_cassette_(op.join(FIXTURES_PATH, name + '.yaml'), *args, **kwargs)
Example no. 29
def resolve_path(path, ds=None, ds_resolved=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something' (relative
    to PWD) can be handled properly, but things like 'down/../under' cannot, as
    resolving this path properly depends on the actual target of any
    (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specification. Multiple path
      specifications can be given as a list.
    ds : Dataset or PathLike or None
      Dataset instance to resolve relative paths against.
    ds_resolved : Dataset or None
      A dataset instance that was created from `ds` outside can be provided
      to avoid multiple instantiation on repeated calls.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input a list is returned, a Path instance
      otherwise.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        ds = ds_resolved or require_dataset(
            ds, check_installed=False, purpose='path resolution')
    out = []
    pwd_parts = None  # get it upon first use but only once
    for p in ensure_list(path):
        if ds is None or not got_ds_instance:
            # no dataset at all or no instance provided -> CWD is always the reference
            # nothing needs to be done here. Path-conversion and absolutification
            # are done next
            pass
        # we have a given datasets instance
        elif not Path(p).is_absolute():
            # we have a dataset and no abspath nor an explicit relative path ->
            # resolve it against the dataset
            p = ds.pathobj / p

        p = ut.Path(p)

        # make sure we return an absolute path, but without actually
        # resolving anything
        if not p.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path input that needs
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: match any number of leading '..' path components
            # and shorten the PWD by that number
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            if not pwd_parts:
                pwd_parts = ut.Path(getpwd()).parts
            path_parts = p.parts
            leading_parents = 0
            for pp in p.parts:
                if pp == op.pardir:
                    leading_parents += 1
                    path_parts = path_parts[1:]
                elif pp == op.curdir:
                    # we want to discard that, but without stripping
                    # a corresponding parent
                    path_parts = path_parts[1:]
                else:
                    break
            p = ut.Path(
                op.join(*(
                    pwd_parts[:-leading_parents if leading_parents else None] +
                    path_parts)))
        # note that we will not "normpath()" the result, check the
        # pathlib docs for why this is the only sane choice in the
        # face of the possibility of symlinks in the path
        out.append(p)
    return out[0] if isinstance(path, (str, PurePath)) else out
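
A compact, purely lexical restatement of the leading-'..' handling described in the comments above, using only pathlib (the example path and working directory are invented):

# Lexical handling of leading '..'/'.' components: strip them from the path
# and drop the same number of trailing components from the (unresolved) cwd.
from pathlib import PurePosixPath


def lexically_absolute(p, cwd):
    parts = PurePosixPath(p).parts
    base = PurePosixPath(cwd).parts
    up = 0
    while parts and parts[0] in ('..', '.'):
        if parts[0] == '..':
            up += 1
        parts = parts[1:]
    if up:
        base = base[:len(base) - up]
    return PurePosixPath(*(base + parts))


print(lexically_absolute("../sibling/file.txt", "/home/user/project"))
# -> /home/user/sibling/file.txt, with no filesystem access or symlink resolution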
Example no. 30
    def __call__(dataset, filename=None, archivetype='tar', compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo
        from datalad.dochelpers import exc_str

        import logging
        lgr = logging.getLogger('datalad.plugin.export_archive')

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti
        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype,
            '{}{}'.format(
                '.' if compression else '',
                compression) if archivetype == 'tar' else '')

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            filename = op.join(filename, default_filename)  # under given directory
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(
                    repo_files, allow_quick=True, batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(
                    repo_files, allow_quick=True, batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning if missing_content == 'continue' else lgr.debug)(
                                'File %s has no content available, skipped', fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' % fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(
                    fpath,
                    arcname=aname,
                    **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(
            status='ok',
            path=filename,
            type='file',
            action='export_archive',
            logger=lgr)
Example no. 31
# will get its own .gitattributes entry to keep it out of the annex
# give relative path to dataset root (use platform notation)
force_in_git = [
    'README',
    'CHANGES',
    'dataset_description.json',
    '.bidsignore',
    'code/**',
    # to not put participants or scan info into Git, might contain sensitive
    # information
    #'*.tsv',
]
# make an attempt to discover the prospective change in .gitattributes
# to decide what needs to be done, and make this procedure idempotent
# (for simple cases)
attr_fpath = op.join(ds.path, '.gitattributes')
if op.lexists(attr_fpath):
    with open(attr_fpath, 'rb') as f:
        attrs = f.read().decode()
else:
    attrs = ''

# amend gitattributes, if needed
ds.repo.set_gitattributes([
    (path, {'annex.largefiles': 'nothing'})
    for path in force_in_git
    if '{} annex.largefiles=nothing'.format(path) not in attrs
])

# leave clean
ds.save(
Example no. 32
    assert_false(op.exists(cache.path))

    # test del
    cache = ArchivesCache()  # by default -- non persistent
    assert_true(op.exists(cache.path))
    cache_path = cache.path
    del cache
    assert_false(op.exists(cache_path))


@pytest.mark.parametrize(
    "return_value,target_value,kwargs",
    [
        ([], None, {}),
        (['file.txt'], None, {}),
        (['file.txt', op.join('d', 'f')], None, {}),
        ([op.join('d', 'f'), op.join('d', 'f2')], 'd', {}),
        ([op.join('d', 'f'), op.join('d', 'f2')], 'd', {
            'consider': 'd'
        }),
        ([op.join('d', 'f'), op.join('d', 'f2')], None, {
            'consider': 'dd'
        }),
        ([op.join('d', 'f'), op.join('d2', 'f2')], None, {}),
        ([op.join('d', 'd2', 'f'),
          op.join('d', 'd2', 'f2')], op.join('d', 'd2'), {}),
        ([op.join('d', 'd2', 'f'),
          op.join('d', 'd2', 'f2')], 'd', {
              'depth': 1
          }),
        # with some parasitic files