def test_nitrc_pipeline(outd):
    get_test_providers('https://www.nitrc.org/ir/')
    from datalad.distribution.dataset import Dataset
    ds = Dataset(outd).create()
    with chpwd(outd):
        out = run_pipeline(
            pipeline(NITRC_IR, project='fcon_1000', subjects=['xnat_S00401']))
    eq_(len(out), 1)


def test_ls_s3():
    url = 's3://datalad-test0-versioned/'
    with swallow_outputs():  # just to skip if no credentials
        get_test_providers(url)
    with swallow_outputs() as cmo:
        assert_equal(ls(url), None)  # no output ATM
        assert_in('Bucket info:', cmo.out)


def test_version_url_deleted():
    get_test_providers('s3://datalad-test0-versioned/',
                       reload=True)  # to verify having credentials to access
                                     # openfmri via S3
    # it existed and then was removed
    fpath = "1version-removed.txt"
    url = "http://datalad-test0-versioned.s3.amazonaws.com/%s" % fpath
    turl = "http://datalad-test0-versioned.s3.amazonaws.com/%s" \
           "?versionId=eZ5Hgwo8azfBv3QT7aW9dmm2sbLUY.QP" % fpath
    eq_(get_versioned_url(url), turl)


def test_mtime(path, url, tempfile):
    # let's set custom mtime
    file_to_download = opj(path, 'file.dat')
    os.utime(file_to_download, (time.time(), 1000))
    assert_equal(os.stat(file_to_download).st_mtime, 1000)

    file_url = "%s/%s" % (url, 'file.dat')
    with swallow_outputs():
        get_test_providers().download(file_url, path=tempfile)
    assert_equal(os.stat(tempfile).st_mtime, 1000)


def test_get_versioned_url():
    get_test_providers('s3://openfmri/tarballs')  # to verify having credentials to access openfmri via S3
    for url_pref in ('http://openfmri.s3.amazonaws.com',
                     'https://s3.amazonaws.com/openfmri'):
        eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz"),
            url_pref + "/tarballs/ds001_raw.tgz?versionId=null")
        eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz?param=1"),
            url_pref + "/tarballs/ds001_raw.tgz?param=1&versionId=null")
        # We don't duplicate the version if it already exists.
        eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz?versionId=null"),
            url_pref + "/tarballs/ds001_raw.tgz?versionId=null")

    # something is wrong there
    #print(get_versioned_url("http://openfmri.s3.amazonaws.com/ds001/demographics.txt"))

    eq_(get_versioned_url("someurl"), "someurl")  # should just return original one
    assert_raises(RuntimeError, get_versioned_url, "someurl", guarantee_versioned=True)

    # TODO: on a bucket without versioning
    url = "http://datalad-test0-nonversioned.s3.amazonaws.com/2versions-removed-recreated.txt"
    eq_(get_versioned_url(url), url)
    eq_(get_versioned_url(url, return_all=True), [url])

    assert_raises(NotImplementedError, get_versioned_url, "s3://buga")

    urls = get_versioned_url(
        "http://datalad-test0-versioned.s3.amazonaws.com/2versions-removed-recreated.txt",
        return_all=True,
        verify=True)
    eq_(len(set(urls)), len(urls))  # all unique
    for url in urls:
        # so we didn't grab other files along with the same prefix
        ok_startswith(
            url,
            'http://datalad-test0-versioned.s3.amazonaws.com/2versions-removed-recreated.txt?versionId=')

    # Update a versioned URL with a newer version tag.
    url_3ver = "http://datalad-test0-versioned.s3.amazonaws.com/3versions-allversioned.txt"
    url_3ver_input = url_3ver + "?versionId=b.qCuh7Sg58VIYj8TVHzbRS97EvejzEl"
    eq_(get_versioned_url(url_3ver_input), url_3ver_input)
    eq_(get_versioned_url(url_3ver_input, update=True),
        url_3ver + "?versionId=Kvuind11HZh._dCPaDAb0OY9dRrQoTMn")


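# A minimal usage sketch for the function exercised above -- a hedged example,
# not part of the test suite. It assumes get_versioned_url is importable from
# datalad.support.s3 (as in datalad) and that the URL points at an S3-backed
# object; example_pin_s3_url is a hypothetical helper name.
def example_pin_s3_url():
    from datalad.support.s3 import get_versioned_url
    url = "http://openfmri.s3.amazonaws.com/tarballs/ds001_raw.tgz"
    # Returns the URL with ?versionId=... appended (or versionId=null when the
    # bucket carries no usable versioning info), so that a later download is
    # pinned to the object version that exists now.
    return get_versioned_url(url)

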
def test_ls_s3():
    url = 's3://datalad-test0-versioned/'
    with swallow_outputs():  # just to skip if no credentials
        get_test_providers(url)
    with swallow_outputs() as cmo:
        res = ls(url)
        assert_equal(len(res), 17)  # all the entries
        counts = Counter(map(lambda x: x.__class__.__name__, res))
        assert_equal(counts, {'Key': 14, 'DeleteMarker': 3})
        assert_in('Bucket info:', cmo.out)


def test_obscure_names(path):
    bucket = "datalad-test2-obscurenames-versioned"
    get_test_providers('s3://' + bucket)  # to verify having s3 credentials
    create(path)
    with externals_use_cassette('test_simple_s3_test2_obscurenames_versioned_crawl_ext'), \
            chpwd(path):
        crawl_init(template="simple_s3", args=dict(bucket=bucket), save=True)
        crawl()
    # fun with unicode was postponed
    ok_clean_git(path, annex=True)
    for f in ['f &$=@:+,?;', "f!-_.*'( )", 'f 1', 'f [1][2]']:
        ok_file_under_git(path, f, annexed=True)


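# A hedged sketch of the crawler workflow the tests above and below exercise,
# assuming the datalad-crawler extension (which registers crawl/crawl_init in
# datalad.api) is installed; example_crawl_bucket is a hypothetical helper.
def example_crawl_bucket(path, bucket):
    from datalad.api import create, crawl, crawl_init
    from datalad.utils import chpwd
    create(path)
    with chpwd(path):
        # record the pipeline configuration in the dataset ...
        crawl_init(template="simple_s3", args=dict(bucket=bucket), save=True)
        # ... then run it to fetch the bucket's keys into the annex
        crawl()

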
def check_basic_scenario(url, d=None):
    ds = Dataset(d).create()
    annex = ds.repo
    # TODO skip if no boto or no credentials
    get_test_providers(url)  # so to skip if unknown creds

    # Let's try to add some file which we should have access to
    ds.download_url(url)
    ds.save()

    # git-annex got a fix where it stopped replacing - in the middle of the filename.
    # Let's cater to the developers who might have some intermediate version that is
    # not easy to compare -- we will just check that there is only one file and that
    # it matches what we expect when outside of the development versions range:
    filenames = glob.glob(op.join(d, '3versions[-_]allversioned.txt'))
    eq_(len(filenames), 1)
    filename = op.basename(filenames[0])
    if external_versions['cmd:annex'] < '8.20200501':
        assert_in('_', filename)
    # Date after the fix in 8.20200501-53-gcabbc91b1
    elif external_versions['cmd:annex'] >= '8.20200512':
        assert_in('-', filename)
    else:
        pass  # either of those is ok

    whereis1 = annex.whereis(filename, output='full')
    eq_(len(whereis1), 2)  # here and datalad
    annex.drop(filename)

    whereis2 = annex.whereis(filename, output='full')
    eq_(len(whereis2), 1)  # datalad

    # make sure that there are no "hidden" error messages, despite the
    # whereis command succeeding
    # https://github.com/datalad/datalad/issues/6453#issuecomment-1047533276
    from datalad.runner import StdOutErrCapture
    # we need to swallow logs since if DATALAD_LOG_LEVEL is set low, we
    # would get all the git-annex debug output in stderr
    with swallow_logs(new_level=logging.INFO) as cml:
        out = annex._call_annex(['whereis'], protocol=StdOutErrCapture)
    eq_(out['stderr'].strip(), '')

    # if we provide some bogus address which we can't access, we shouldn't pollute output
    with assert_raises(CommandError) as cme:
        annex.add_url_to_file('bogus', url + '_bogus')
    assert_in('addurl: 1 failed', cme.value.stderr)


def _annex(path):
    annex = Annexificator(path, special_remotes=[DATALAD_SPECIAL_REMOTE])
    url = 's3://datalad-test0-versioned'
    providers = get_test_providers(url)  # to skip if no credentials
    # start with a fresh bucket session each time so that reusing
    # the same vcr tapes works
    providers.get_provider(url).get_downloader(url).reset()
    return annex


def check_download_external_url(url, failed_str, success_str, d, url_final=None):
    fpath = opj(d, get_url_straight_filename(url))
    providers = get_test_providers(url)  # url for check of credentials
    provider = providers.get_provider(url)
    downloader = provider.get_downloader(url)

    # Download way
    with swallow_outputs() as cmo:
        downloaded_path = downloader.download(url, path=d)
    assert_equal(fpath, downloaded_path)
    with open(fpath) as f:
        content = f.read()
        if success_str is not None:
            assert_in(success_str, content)
        if failed_str is not None:
            assert_false(failed_str in content)

    # And if we specify size
    for s in [1, 2]:
        with swallow_outputs() as cmo:
            downloaded_path_ = downloader.download(url, path=d, size=s, overwrite=True)
        # should not be affected
        assert_equal(downloaded_path, downloaded_path_)
        with open(fpath) as f:
            content_ = f.read()
            assert_equal(len(content_), s)
            assert_equal(content_, content[:s])

    # Fetch way
    content = downloader.fetch(url)
    if success_str is not None:
        assert_in(success_str, content)
    if failed_str is not None:
        assert_false(failed_str in content)

    # And if we specify size
    for s in [1, 2]:
        with swallow_outputs() as cmo:
            content_ = downloader.fetch(url, size=s)
        assert_equal(len(content_), s)
        assert_equal(content_, content[:s])

    # Verify status
    status = downloader.get_status(url)
    assert isinstance(status, FileStatus)
    if not url.startswith('ftp://'):
        # TODO introduce support for mtime into requests_ftp?
        assert status.mtime
    assert status.size

    # Verify possible redirections
    if url_final is None:
        url_final = url
    assert_equal(downloader.get_target_url(url), url_final)


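# A hedged sketch of the provider/downloader pattern checked above, assuming
# datalad's Providers registry from datalad.downloaders.providers; the helper
# name example_download is hypothetical. A URL is resolved to a provider,
# which hands out a downloader that can write to disk or fetch into memory.
def example_download(url, destdir):
    from datalad.downloaders.providers import Providers
    providers = Providers.from_config_files()  # providers + credentials from config
    downloader = providers.get_provider(url).get_downloader(url)
    path = downloader.download(url, path=destdir)  # download into a directory
    content = downloader.fetch(url, decode=False)  # raw bytes in memory
    return path, content

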
def test_drop(path):
    get_test_providers('s3://datalad-test0-nonversioned')  # to verify having s3 credentials
    create(path)
    # unfortunately this doesn't work without force dropping since I guess vcr
    # stops and then gets queried again for the same tape while testing for
    # drop :-/
    with externals_use_cassette('test_simple_s3_test0_nonversioned_crawl_ext'), \
            chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True  # so test goes faster
            ),
            save=True
        )
        crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))


def check_download_external_url(url, failed_str, success_str, d):
    fpath = opj(d, get_url_straight_filename(url))
    providers = get_test_providers(url)  # url for check of credentials
    provider = providers.get_provider(url)
    downloader = provider.get_downloader(url)

    # Download way
    with swallow_outputs() as cmo:
        downloaded_path = downloader.download(url, path=d)
    assert_equal(fpath, downloaded_path)
    with open(fpath) as f:
        content = f.read()
        if success_str is not None:
            assert_in(success_str, content)
        if failed_str is not None:
            assert_false(failed_str in content)

    # And if we specify size
    for s in [1, 2]:
        with swallow_outputs() as cmo:
            downloaded_path_ = downloader.download(url, path=d, size=s, overwrite=True)
        # should not be affected
        assert_equal(downloaded_path, downloaded_path_)
        with open(fpath) as f:
            content_ = f.read()
            assert_equal(len(content_), s)
            assert_equal(content_, content[:s])

    # Fetch way
    content = downloader.fetch(url)
    if success_str is not None:
        assert_in(success_str, content)
    if failed_str is not None:
        assert_false(failed_str in content)

    # And if we specify size
    for s in [1, 2]:
        with swallow_outputs() as cmo:
            content_ = downloader.fetch(url, size=s)
        assert_equal(len(content_), s)
        assert_equal(content_, content[:s])

    # Verify status
    status = downloader.get_status(url)
    assert isinstance(status, FileStatus)
    assert status.mtime
    assert status.size


def _test_drop(path, drop_immediately):
    s3url = 's3://datalad-test0-nonversioned'
    providers = get_test_providers(s3url)  # to verify having s3 credentials
    # vcr tape is getting bound to the session object, so we need to
    # force re-establishing the session for the bucket.
    # TODO (in datalad): make a dedicated API for that, now too obscure
    _ = providers.get_status(s3url, allow_old_session=False)
    create(path)
    # unfortunately this doesn't work without force dropping since I guess vcr
    # stops and then gets queried again for the same tape while testing for
    # drop :-/
    with chpwd(path):
        crawl_init(
            template="simple_s3",
            args=dict(
                bucket="datalad-test0-nonversioned",
                drop=True,
                drop_force=True,  # so test goes faster
                drop_immediately=drop_immediately,
            ),
            save=True)
    if drop_immediately:
        # Cannot figure out why, but taping that interaction results in a
        # git-annex addurl error; no time to investigate, so for now we
        # just crawl without vcr. TODO: figure out WTF
        with chpwd(path):
            crawl()
    else:
        with externals_use_cassette(
                'test_simple_s3_test0_nonversioned_crawl_ext'
                + ('_immediately' if drop_immediately else '')), \
                chpwd(path):
            crawl()
    # test that all was dropped
    repo = AnnexRepo(path, create=False)
    files = glob(_path_(path, '*'))
    eq_(len(files), 8)
    for f in files:
        assert_false(repo.file_has_content(f))


def check_download_external_url(url, failed_str, success_str, d, url_final=None):
    fpath = opj(d, get_url_straight_filename(url))
    providers = get_test_providers(url)  # url for check of credentials
    provider = providers.get_provider(url)
    downloader = provider.get_downloader(url)

    # we will load/fetch binary blobs
    success_bytes, failed_bytes = None, None
    if success_str is not None:
        success_bytes = success_str.encode()
    if failed_str is not None:
        failed_bytes = failed_str.encode()

    # Download way
    with swallow_outputs() as cmo:
        downloaded_path = downloader.download(url, path=d)
    assert_equal(fpath, downloaded_path)
    content = read_file(fpath, decode=False)
    if success_bytes is not None:
        assert_in(success_bytes, content)
    if failed_str is not None:
        assert_false(failed_bytes in content)

    # And if we specify size
    for s in [1, 2]:
        with swallow_outputs() as cmo:
            downloaded_path_ = downloader.download(url, path=d, size=s, overwrite=True)
        # should not be affected
        assert_equal(downloaded_path, downloaded_path_)
        content_ = read_file(fpath, decode=False)
        assert_equal(len(content_), s)
        assert_equal(content_, content[:s])

    # Fetch way
    content = downloader.fetch(url, decode=False)
    if success_bytes is not None:
        assert_in(success_bytes, content)
    if failed_bytes is not None:
        assert_false(failed_bytes in content)

    # And if we specify size
    for s in [1, 2]:
        with swallow_outputs() as cmo:
            content_ = downloader.fetch(url, size=s, decode=False)
        assert_equal(len(content_), s)
        assert_equal(content_, content[:s])

    # Verify status
    status = downloader.get_status(url)
    assert isinstance(status, FileStatus)
    if not url.startswith('ftp://'):
        # TODO introduce support for mtime into requests_ftp?
        assert status.mtime
    assert status.size

    # Verify possible redirections
    if url_final is None:
        url_final = url
    assert_equal(downloader.get_target_url(url), url_final)