def fetch_data(tmpdir, subject):
    """Fetches some test dicoms using datalad"""
    from datalad import api
    targetdir = op.join(tmpdir, 'QA')
    api.install(path=targetdir,
                source='http://datasets-tests.datalad.org/dbic/QA')
    api.get('{}/sourcedata/{}'.format(targetdir, subject))
    return targetdir
def fetch_data(tmpdir, dicoms):
    """Fetches some test DICOMs using datalad"""
    data = os.path.join(tmpdir, 'data')
    api.install(path=data, source=DICOM_DIR)
    data = os.path.join(data, dicoms)
    api.get(path=data)
    return data
def fetch_data(tmpdir, dataset, getpath=None):
    """
    Utility function to interface with datalad database.
    Performs datalad `install` and datalad `get` operations.

    Parameters
    ----------
    tmpdir : str
        directory to temporarily store data
    dataset : str
        dataset path from `http://datasets-tests.datalad.org`
    getpath : str [optional]
        exclusive path to get

    Returns
    -------
    targetdir : str
        directory with installed dataset
    """
    from datalad import api
    targetdir = op.join(tmpdir, op.basename(dataset))
    api.install(path=targetdir,
                source='http://datasets-tests.datalad.org/{}'.format(dataset))

    getdir = targetdir + (op.sep + getpath if getpath is not None else '')
    api.get(getdir)
    return targetdir
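# Usage sketch (added for illustration, not part of the original sources): how a
# test could call the fetch_data() helper defined directly above. The dataset
# name 'dbic/QA' appears elsewhere in these examples; the subject sub-path
# 'sourcedata/sub-01' is a hypothetical placeholder.
import os.path as op
import tempfile

def example_fetch_data_usage():
    with tempfile.TemporaryDirectory() as tmpdir:
        # install the dataset under tmpdir/QA and fetch only the requested sub-path
        targetdir = fetch_data(tmpdir, 'dbic/QA',
                               getpath=op.join('sourcedata', 'sub-01'))
        assert op.isdir(targetdir)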
def test_install_crcns(tdir, ds_path):
    with chpwd(tdir):
        with swallow_logs(new_level=logging.INFO) as cml:
            install("all-nonrecursive", source='///')
            # log decorations such as the log level are not captured while
            # swallowing, so check both that no ERROR was logged and that no
            # exit code was reported
            assert_not_in('ERROR', cml.out)
            # below one must not fail alone! ;)
            assert_not_in('with exit code', cml.out)

        # should not hang in infinite recursion
        with chpwd('all-nonrecursive'):
            get("crcns")
        ok_(exists(_path_("all-nonrecursive/crcns/.git/config")))
        # and we could repeat installation and get the same result
        ds1 = install(_path_("all-nonrecursive/crcns"))
        ds2 = Dataset('all-nonrecursive').install('crcns')
        ok_(ds1.is_installed())
        eq_(ds1, ds2)
        eq_(ds1.path, ds2.path)  # to make sure they are a single dataset

    # again, but into existing dataset:
    ds = create(ds_path)
    crcns = ds.install("///crcns")
    ok_(crcns.is_installed())
    eq_(crcns.path, opj(ds_path, "crcns"))
    assert_in(crcns.path, ds.get_subdatasets(absolute=True))
def fetch_data(tmpdir, subject):
    """Fetches some test dicoms using datalad"""
    from datalad import api
    targetdir = op.join(tmpdir, 'QA')
    api.install(path=targetdir, source='///dbic/QA')
    api.get('{}/sourcedata/{}'.format(targetdir, subject))
    return targetdir
def test_real_data(infile):
    dl.get(infile)
    data = np.recfromcsv(infile,
                         delimiter='\t',
                         names=['x', 'y', 'pupil', 'frame'])

    clf = d.EyegazeClassifier(
        #px2deg=0.0185581232561,
        px2deg=0.0266711972026,
        sampling_rate=1000.0)
    p = clf.preproc(data)
    events = clf(p[:50000],
                 #p,
                 )

    evdf = ut.events2df(events)

    labels = list(evdf['label'])
    # find all kinds of events
    for t in ('FIXA', 'PURS', 'SACC', 'LPSO', 'HPSO', 'ISAC', 'IHPS'):
        # 'ILPS' one file doesn't have any
        assert t in labels

    return
    ut.show_gaze(pp=p[:50000], events=events)
    #ut.show_gaze(pp=p, events=events)
    import pylab as pl
    saccades = evdf[evdf['label'] == 'SACC']
    isaccades = evdf[evdf['label'] == 'ISAC']
    print('#saccades', len(saccades), len(isaccades))
    pl.plot(saccades['amp'], saccades['peak_vel'], '.', alpha=.3)
    pl.plot(isaccades['amp'], isaccades['peak_vel'], '.', alpha=.3)
    pl.show()
def check_comparison_dir(self):
    cmpr_path = self.data.comparison_dir
    if not cmpr_path.exists():
        raise ValueError(
            "The following path does not exist but is required to "
            f"perform a test: {cmpr_path}.\n"
            "You may wish to run the test with the --create_sample_output "
            "flag or generate output for future test sessions with "
            "--save_sample_output.")
    cmpr_files = list(cmpr_path.glob("**/*"))

    cmpr_files_rel = [f.relative_to(cmpr_path) for f in cmpr_files]
    files_required = [
        f.relative_to(self.data.outdir) for f in self.file_list
    ]

    missing_files = []
    for f in files_required:
        if f not in cmpr_files_rel:
            missing_files.append(str(cmpr_path / f))
    if missing_files:
        m_str = " ".join(missing_files)
        raise ValueError(
            "The following files are missing and are required to "
            f"fully complete the test: {m_str}")

    need_data = any(p.is_symlink() and not p.exists() for p in cmpr_files)
    if need_data:
        datalad.get(str(cmpr_path))
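# Stand-alone sketch (an assumption for illustration, not part of the original
# sources) of the pattern used at the end of check_comparison_dir() above: only
# call `datalad get` when a directory still contains dangling annex symlinks.
# 'some_dir' is a hypothetical path.
from pathlib import Path
import datalad.api as datalad

def ensure_annexed_content(some_dir):
    """Fetch annexed content for a directory if any of it is still missing."""
    files = list(Path(some_dir).glob("**/*"))
    # annexed-but-absent files show up as symlinks whose targets do not exist
    if any(p.is_symlink() and not p.exists() for p in files):
        datalad.get(str(some_dir))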
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())

    # We must not be able to create a new repository under a known
    # subdataset path.
    # Note: Unfortunately we would still be able to generate it under
    # subdirectory within submodule, e.g. `subm 1/subdir` but that is
    # not checked here. `rev-create` will provide that protection
    # when create/rev-create merge.
    with assert_raises(PathKnownToRepositoryError):
        subds.create()

    # get the submodule
    # This would init so there is a .git file with symlink info, which is
    # as we agreed is more pain than gain, so let's use our install which would
    # do it right, after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
def savegaze():
    """
    small function to generate and save remodnav classification figures
    """
    from remodnav.tests import utils as ut
    import pylab as pl
    import datalad.api as dl

    # use two exemplary files (lab + MRI) used during testing as well
    # hardcoding those, as I see no reason for updating them
    infiles = [
        op.join(
            'data',
            'raw_eyegaze', 'sub-32', 'beh',
            'sub-32_task-movie_run-5_recording-eyegaze_physio.tsv.gz'),
        op.join(
            'data',
            'raw_eyegaze', 'sub-02', 'ses-movie', 'func',
            'sub-02_ses-movie_task-movie_run-5_recording-eyegaze_physio.tsv.gz'
        ),
    ]
    # one call per file due to https://github.com/datalad/datalad/issues/3356
    for f in infiles:
        dl.get(f)

    for f in infiles:
        # read data
        data = np.recfromcsv(f,
                             delimiter='\t',
                             names=['x', 'y', 'pupil', 'frame'])

        # adjust px2deg conversion factor according to datafile
        pxdeg, ext = (0.0266711972026, 'lab') if '32' in f \
            else (0.0185581232561, 'mri')
        clf = EyegazeClassifier(
            px2deg=pxdeg,
            sampling_rate=1000.0)
        p = clf.preproc(data)
        # let's go with 10 seconds to actually see details. This particular time
        # window is within the originally plotted 50s and contains missing data
        # for both data types (lab & mri)
        events = clf(p[15000:25000])

        fig = pl.figure(
            # fake size to get the font size down in relation
            figsize=(14, 2),
            dpi=120,
            frameon=False)
        ut.show_gaze(
            pp=p[15000:25000],
            events=events,
            sampling_rate=1000.0,
            show_vels=True,
            coord_lim=(0, 1280),
            vel_lim=(0, 1000))
        pl.savefig(
            op.join('img', 'remodnav_{}.svg'.format(ext)),
            transparent=True,
            bbox_inches="tight")
        pl.close()
def plot_dist(figures):
    """
    Plot the events duration distribution per movie run, per data set.
    """
    import pandas as pd

    # do nothing if we don't want to plot
    if not figures:
        return

    import datalad.api as dl
    dl.install(op.join('data', 'studyforrest-data-eyemovementlabels'))
    datapath = op.join('data',
                       'studyforrest-data-eyemovementlabels',
                       'sub*',
                       '*.tsv')
    data = sorted(glob(datapath))
    dl.get(dataset='.', path=data)

    for ds, ds_name in [(mri_ids, 'mri'), (lab_ids, 'lab')]:
        dfs = [
            pd.read_csv(f, header=0, delim_whitespace=True)
            for f in data
            if any('sub-{}'.format(i) in f for i in ds)
        ]
        df = pd.concat(dfs)
        # that's a concatenated dataframe with all files from one dataset (lab or mri)
        # extract relevant event types
        SACs = df[(df.label == 'SACC') | (df.label == 'ISACS')]
        FIX = df[df.label == 'FIXA']
        PSOs = df[(df.label == 'HPSO') | (df.label == 'IHPS') |
                  (df.label == 'LPSO') | (df.label == 'ILPS')]
        PURs = df[df.label == 'PURS']
        # plot a histogram. Set the same x-axis limits as NH for fixations and
        # saccades, and exclude outlying 0.5% for other events
        for (ev_df, label, x_lim, y_lim) in [
                (SACs, 'saccade', (0, 0.16), (1, 62000)),
                (FIX, 'fixation', (0, 1.0), (1, 50000)),
                (PSOs, 'PSO', (0, 0.04), (1, 26000)),
                (PURs, 'pursuit', (0, .8), (1, 30000))]:
            fig = pl.figure(figsize=(3, 2))
            pl.hist(ev_df['duration'].values, bins='doane',
                    range=x_lim, color='gray')
            #log=True)
            pl.xlabel('{} duration in s'.format(label))
            pl.xlim(x_lim)
            pl.ylim(y_lim)
            pl.savefig(
                op.join(
                    'img',
                    'hist_{}_{}.svg'.format(label, ds_name)),
                transparent=True,
                bbox_inches="tight")
            pl.close()
def test_cmdline(infile, tmpdir):
    import remodnav
    dl.get(infile)
    outfname = tmpdir.mkdir('bids').join("events.tsv").strpath
    remodnav.main(['fake', infile, outfname, '0.0266711972026', '1000'])
    assert op.exists(outfname)
    assert op.exists(outfname[:-4] + '.png')
def _fetch_data(datadir, dicoms):
    """Fetches some test DICOMs using datalad"""
    try:
        api.install(path=datadir, source=DICOM_DIR)
        data = os.path.join(datadir, dicoms)
        api.get(path=data)
    except IncompleteResultsError as exc:
        pytest.skip("Failed to fetch test data: %s" % str(exc))
    return data
def main():
    """Entry point"""
    thispath = os.getcwd()
    opts = get_parser().parse_args()
    np.random.seed(opts.seed)

    out_file = None
    if opts.output_file is not None:
        out_file = os.path.abspath(opts.output_file)

    os.chdir(opts.openfmri_dir)
    all_sub = sorted(glob.glob('ds*/sub-*'))

    datasets = {}
    multises = set()
    for subj in all_sub:
        ds = subj.split('/')[0]
        if os.path.isdir(os.path.join(subj, 'anat')) and \
                os.path.isdir(os.path.join(subj, 'func')):
            datasets.setdefault(ds, []).append(os.path.basename(subj))
        elif (glob.glob(os.path.join(subj, 'ses-*', 'anat')) and
              glob.glob(os.path.join(subj, 'ses-*', 'func'))):
            multises.add(ds)
            datasets.setdefault(ds, []).append(os.path.basename(subj))

    subsample = {}
    n_sample = 0
    for ds, sublist in datasets.items():
        n_sample += min(opts.num_participants, len(sublist))
        if len(sublist) <= opts.num_participants:
            subsample[ds] = sublist
        else:
            subsample[ds] = sorted(np.random.choice(
                sublist, size=opts.num_participants, replace=False).tolist())

    # Double check everything looks good
    assert n_sample == len([sub for _, sublist in subsample.items() for sub in sublist])

    if out_file is not None:
        import yaml
        with open(out_file, 'w') as outfh:
            outfh.write(yaml.dump(subsample))
        print('Sampled participants stored to %s' % out_file)

    singleses = set(datasets.keys()) - multises
    print('Sampled %d participants' % n_sample)
    print('Datasets summary:\n\tSingle-session=%d'
          '\n\tMulti-session=%d'
          '\n\tTotal participants=%d' % (len(singleses), len(multises), n_sample))

    os.chdir(thispath)
    if opts.datalad_fetch:
        import datalad.api as dlad
        for ds, sublist in subsample.items():
            for sub in sublist:
                dlad.get(path=os.path.join(opts.openfmri_dir, ds, sub),
                         recursive=True, jobs=opts.njobs, verbose=True)
def test_get_in_unavailable_subdataset(src, path):
    origin_ds = _make_dataset_hierarchy(src)
    root = install(path, source=src)
    targetpath = opj('sub1', 'sub2')
    targetabspath = opj(root.path, targetpath)
    get(targetabspath)
    # we got the dataset, and its immediate content, but nothing below
    sub2 = Dataset(targetabspath)
    ok_(sub2.is_installed())
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not Dataset(opj(targetabspath, 'sub3')).is_installed())
def read_data(data):
    """
    Get and read in data.
    """
    # if the data is not retrieved, get it using datalad get.
    dl.get(data)
    # read data into a pandas dataframe
    df = pd.read_csv(data)
    attributes = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "class"
    ]
    df.columns = attributes
    return df
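# Hypothetical usage of read_data() above; the file path 'data/iris.csv' is a
# placeholder for a DataLad-tracked copy of the Iris table.
if __name__ == '__main__':
    df = read_data('data/iris.csv')    # runs `datalad get` first, then loads the CSV
    print(df['class'].value_counts())  # quick sanity check of the class labels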
def _datalad_get(filepath):
    if not filepath:
        return

    from datalad import api
    from datalad.support.exceptions import IncompleteResultsError

    try:
        api.get(str(filepath))
    except IncompleteResultsError as exc:
        if exc.failed[0]['message'] == 'path not associated with any dataset':
            from .conf import TF_GITHUB_SOURCE
            api.install(path=TF_LAYOUT.root, source=TF_GITHUB_SOURCE,
                        recursive=True)
            api.get(str(filepath))
        else:
            raise
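# Minimal sketch of the same fallback idiom as _datalad_get() above, written as
# a generic helper (an assumption for illustration; `root` and `source` are
# caller-supplied placeholders, not the module's real configuration).
from datalad import api
from datalad.support.exceptions import IncompleteResultsError

def get_with_install_fallback(filepath, root, source):
    """Try `datalad get`; if the path belongs to no installed dataset yet,
    install the superdataset from `source` into `root` and retry."""
    try:
        api.get(str(filepath))
    except IncompleteResultsError as exc:
        if exc.failed and exc.failed[0]['message'] == 'path not associated with any dataset':
            api.install(path=root, source=source, recursive=True)
            api.get(str(filepath))
        else:
            raise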
def test_install_known_subdataset(src, path):
    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that it is the correct submodule installed and not
    # new repository initiated
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
def download_files(dataset, filenames, time_limit=120):
    if len(filenames) == 0:
        return

    responses = []
    with timeout(time_limit):
        for filename in filenames:
            full_path = os.path.join(dataset, filename)
            responses = api.get(path=full_path, on_failure="ignore")
            for response in responses:
                if response.get("status") in ["ok", "notneeded"]:
                    continue
                if response.get("status") in ["impossible", "error"]:
                    pytest.fail(
                        f"{full_path}\n{response.get('message')}", pytrace=False
                    )

    if not responses:
        pytest.fail(
            f"The dataset timed out after {time_limit} seconds before retrieving a file."
            " Cannot tell if the download would be successful."
            f"\n{filename} has size of {humanize.naturalsize(get_annexed_file_size(dataset, filename))}.",
            pytrace=False,
        )
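# Stand-alone sketch (illustrative, not from the original suite) of the
# per-result status check used in download_files() above: call `get` with
# on_failure="ignore" and inspect each result record rather than letting
# datalad raise on the first failure.
import datalad.api as api

def try_get(path):
    ok = True
    for res in api.get(path=path, on_failure="ignore"):
        if res.get("status") in ("impossible", "error"):
            print("failed: %s (%s)" % (res.get("path"), res.get("message")))
            ok = False
    return ok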
def get_dataset_data(ds, path_to_get, verbose=False, parallelized=None):
    """
    Gets data from dataset (ds) using datalad.api.get()

    Returns a list of file paths to the files that would have been downloaded
    by this command, even if they already existed in the filesystem

    Throws exception if a bad status is returned from datalad.api.get()

    parallelized is either 'None' or an integer describing the number of jobs
    to use (passed directly to datalad.api.get)

    If verbose is True, datalad.api.get will print out the list of files it is
    downloading in json pretty-print
    """
    get_kwargs = {'path': path_to_get, 'dataset': ds, 'jobs': parallelized}
    if verbose:
        get_kwargs['result_renderer'] = 'json_pp'
    specific_data = api.get(**get_kwargs)
    file_paths = []
    for data_file_response in specific_data:
        assert data_file_response['status'] == 'ok', \
            "Requires an 'ok' status, received %s" % (
                data_file_response['status'])
        if data_file_response['type'] == 'file':
            file_paths.append(data_file_response['path'])
    return file_paths
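# Hypothetical call of get_dataset_data() above; the source URL, clone path and
# requested sub-path are placeholders for illustration only.
from datalad import api

ds = api.install(path='/tmp/example-ds',
                 source='https://github.com/datalad/example.git')  # placeholder source
paths = get_dataset_data(ds, 'derivatives', parallelized=4)
print('%d files present locally' % len(paths))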
def recurse(directory, odds):
    """
    recurse recursively checks each file in directory and sub-directories with
    odds chance. Odds is a positive decimal that dictates how likely a file is
    to be tested, from 0 (no chance) to 1 or above (100% chance).

    This function tests whether files can be retrieved with datalad and, if
    they can't, whether an authentication setup is in place for security.
    """
    # Get all file names in directory
    files = listdir(directory)
    # Loop through every file
    for file_name in files:
        # If the file name is .git or .datalad, ignore
        if file_name == ".git" or file_name == ".datalad":
            continue
        full_path = join(directory, file_name)
        # If the file is a directory
        if isdir(full_path):
            return recurse(full_path, odds)
        # If the file is a broken symlink and with odd chance
        elif not exists(full_path) and random() < odds:
            msg = api.get(path=full_path, on_failure="ignore",
                          return_type="item-or-list")
            # Check for authentication
            if msg["status"] == "error" and "unable to access" not in msg["message"].lower():
                return ("Cannot download file and didn't hit authentication "
                        "request for file: " + full_path)
    return "All good"
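# Hypothetical invocation of recurse() above: spot-check roughly 10% of the
# annexed files in an installed clone. The path is a placeholder.
result = recurse("/tmp/installed-dataset", odds=0.1)
assert result == "All good", result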
def test_gh3356(src, path):
    # create toy version of gh-3356 scenario
    origin = Dataset(src).create()
    origin_sub = origin.create(origin.pathobj / 'subdir' / 'subds')
    for p in (
            (origin_sub.pathobj / 'data' / 'file_in_annex.txt'),
            (origin_sub.pathobj / 'data' / 'file_in_annex2.txt')):
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_text(p.name)
    origin.save(recursive=True)
    clone = install(
        path, source=src, result_xfm='datasets', return_type='item-or-list')
    targetpaths = [
        opj('subdir', 'subds', 'data', 'file_in_annex.txt'),
        opj('subdir', 'subds', 'data', 'file_in_annex2.txt'),
    ]
    with chpwd(path):
        res = get(targetpaths)
    # get() must report success on two files
    assert_result_count(res, 2, action='get', type='file', status='ok')
    # status must report content for two files
    assert_result_count(
        clone.status(recursive=True, annex='all', report_filetype='eval'),
        2,
        action='status', has_content=True)
def test_install_known_subdataset(src=None, path=None):
    _mk_submodule_annex(src, fname="test-annex.dat", fcontent="whatever")

    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(state='absent', result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(state='present', result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that it is the correct submodule installed and not
    # new repository initiated
    assert_in("test-annex.dat", subds.repo.get_indexed_files())
    assert_not_in('subm 1', ds.subdatasets(state='absent', result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(state='present', result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
def test_get_autoresolve_recurse_subdatasets(src, path):
    origin = Dataset(src).create()
    origin_sub = origin.create('sub')
    origin_subsub = origin_sub.create('subsub')
    with open(opj(origin_subsub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')
    origin.save(recursive=True)

    ds = install(
        path, source=src, result_xfm='datasets', return_type='item-or-list')
    eq_(len(ds.subdatasets(fulfilled=True)), 0)
    with chpwd(ds.path):
        results = get(opj(ds.path, 'sub'), recursive=True,
                      result_xfm='datasets')
    eq_(len(ds.subdatasets(fulfilled=True, recursive=True)), 2)
    subsub = Dataset(opj(ds.path, 'sub', 'subsub'))
    ok_(subsub.is_installed())
    assert_in(subsub, results)
    # all file handles are fulfilled by default
    ok_(Dataset(opj(ds.path, 'sub', 'subsub')).repo.file_has_content(
        "file_in_annex.txt") is True)
def test_install_recursive_repeat(src, path):
    subsub_src = Dataset(opj(src, 'sub 1', 'subsub')).create(force=True)
    sub1_src = Dataset(opj(src, 'sub 1')).create(force=True)
    sub2_src = Dataset(opj(src, 'sub 2')).create(force=True)
    top_src = Dataset(src).create(force=True)
    top_src.save(auto_add_changes=True, recursive=True)

    # install top level:
    top_ds = install(path, source=src)
    ok_(top_ds.is_installed() is True)
    sub1 = Dataset(opj(path, 'sub 1'))
    ok_(sub1.is_installed() is False)
    sub2 = Dataset(opj(path, 'sub 2'))
    ok_(sub2.is_installed() is False)
    subsub = Dataset(opj(path, 'sub 1', 'subsub'))
    ok_(subsub.is_installed() is False)

    # install again, now with data and recursive, but recursion_limit 1:
    result = get(os.curdir, dataset=path, recursive=True, recursion_limit=1)
    # top-level dataset was not reobtained
    assert_not_in(top_ds, result)
    assert_in(sub1, result)
    assert_in(sub2, result)
    assert_not_in(subsub, result)
    ok_(top_ds.repo.file_has_content('top_file.txt') is True)
    ok_(sub1.repo.file_has_content('sub1file.txt') is True)
    ok_(sub2.repo.file_has_content('sub2file.txt') is True)

    # install sub1 again, recursively and with data
    top_ds.install('sub 1', recursive=True, get_data=True)
    ok_(subsub.is_installed())
    ok_(subsub.repo.file_has_content('subsubfile.txt'))
def download_files(dataset, dataset_size, *, num=4):
    filenames, contains_archived_files = get_filenames(dataset, minimum=num)
    k_smallest = get_approx_ksmallests(dataset, filenames)
    if len(k_smallest) == 0:
        return

    download_size = (dataset_size
                     if contains_archived_files
                     else get_sample_files_size(dataset, k_smallest))

    # Set a time limit based on the download size.
    # Limit between 20 sec and 10 minutes to avoid the test failing or hanging.
    time_limit = int(max(20, min(download_size * 1.2 // 2e6, 600)))

    responses = []
    with timeout(time_limit):
        for filename in k_smallest:
            full_path = os.path.join(dataset, filename)
            responses = api.get(path=full_path, on_failure="ignore")
            for response in responses:
                if response.get("status") in ["ok", "notneeded"]:
                    continue
                if response.get("status") in ["impossible", "error"]:
                    pytest.fail(
                        f"{full_path}\n{response.get('message')}",
                        pytrace=False,
                    )

    if not responses:
        pytest.fail(
            f"The dataset timed out after {time_limit} seconds before retrieving a file."
            " Cannot tell if the download would be successful."
            f"\n{filename} has size of {humanize.naturalsize(get_annexed_file_size(dataset, filename))}.",
            pytrace=False,
        )
def test_get_in_unavailable_subdataset(src, path):
    _make_dataset_hierarchy(src)
    root = install(
        path, source=src, result_xfm='datasets', return_type='item-or-list')
    targetpath = opj('sub1', 'sub2')
    targetabspath = opj(root.path, targetpath)
    with chpwd(path):
        res = get(targetabspath)
    assert_result_count(res, 2, status='ok', action='install', type='dataset')
    # dry-fit result filter that only returns the result that matched the
    # requested path
    filtered = [r for r in res if only_matching_paths(r, path=targetabspath)]
    assert_result_count(
        filtered, 1, status='ok', action='install', type='dataset',
        path=targetabspath)
    # we got the dataset, and its immediate content, but nothing below
    sub2 = Dataset(targetabspath)
    ok_(sub2.is_installed())
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not Dataset(opj(targetabspath, 'sub3')).is_installed())
def test_install_recursive_repeat(src, path):
    subsub_src = Dataset(opj(src, 'sub 1', 'subsub')).create(force=True)
    sub1_src = Dataset(opj(src, 'sub 1')).create(force=True)
    sub2_src = Dataset(opj(src, 'sub 2')).create(force=True)
    top_src = Dataset(src).create(force=True)
    top_src.add('.', recursive=True)
    ok_clean_git(top_src.path)

    # install top level:
    top_ds = install(path, source=src)
    ok_(top_ds.is_installed() is True)
    sub1 = Dataset(opj(path, 'sub 1'))
    ok_(sub1.is_installed() is False)
    sub2 = Dataset(opj(path, 'sub 2'))
    ok_(sub2.is_installed() is False)
    subsub = Dataset(opj(path, 'sub 1', 'subsub'))
    ok_(subsub.is_installed() is False)

    # install again, now with data and recursive, but recursion_limit 1:
    result = get(os.curdir, dataset=path, recursive=True, recursion_limit=1,
                 result_xfm='datasets')
    # top-level dataset was not reobtained
    assert_not_in(top_ds, result)
    assert_in(sub1, result)
    assert_in(sub2, result)
    assert_not_in(subsub, result)
    ok_(top_ds.repo.file_has_content('top_file.txt') is True)
    ok_(sub1.repo.file_has_content('sub1file.txt') is True)
    ok_(sub2.repo.file_has_content('sub2file.txt') is True)

    # install sub1 again, recursively and with data
    top_ds.install('sub 1', recursive=True, get_data=True)
    ok_(subsub.is_installed())
    ok_(subsub.repo.file_has_content('subsubfile.txt'))
def test_implicit_install(src, dst):
    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.save("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.save("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.save("file3.txt")
    origin_top.save(recursive=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(IncompleteResultsError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    # but by default implicit results are not reported
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(IncompleteResultsError, ds.install,
                  source=opj('sub', 'obscure'))

    # clean up, the nasty way
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        # don't ask for the file content to make return value comparison
        # simpler
        result = get(path=opj("sub", "subsub"), get_data=False,
                     result_xfm='datasets')
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [sub, subsub])
def test_implicit_install(src, dst):
    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.add("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.add("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.add("file3.txt")
    origin_top.save(recursive=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(IncompleteResultsError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    # but by default implicit results are not reported
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(IncompleteResultsError, ds.install,
                  source=opj('sub', 'obscure'))

    # clean up, the nasty way
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        # don't ask for the file content to make return value comparison
        # simpler
        result = get(path=opj("sub", "subsub"), get_data=False,
                     result_xfm='datasets')
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [sub, subsub])
def test_download(self, dataset):
    eval_config(dataset)
    authenticate(dataset)

    filenames = get_filenames(dataset)
    if len(filenames) == 0:
        return True

    k_smallest = get_approx_ksmallests(dataset, filenames)

    # Restricted Zenodo datasets require downloading the whole archive before
    # individual files can be retrieved.
    project = project_name2env(dataset.split("/")[-1])
    if os.getenv(project + "_ZENODO_TOKEN", None):
        with timeout(300):
            api.get(path=dataset, on_failure="ignore")

    download_files(dataset, k_smallest)
def test_autoresolve_multiple_datasets(src, path):
    with chpwd(path):
        ds1 = install('ds1', source=src)
        ds2 = install('ds2', source=src)
        results = get([opj('ds1', 'test-annex.dat')] +
                      glob(opj('ds2', '*.dat')))
        # each ds has one file
        eq_(len(results), 2)
        ok_(ds1.repo.file_has_content('test-annex.dat') is True)
        ok_(ds2.repo.file_has_content('test-annex.dat') is True)
def test_implicit_install(src, dst):
    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.add("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.add("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.add("file3.txt")
    origin_top.save(auto_add_changes=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(InstallFailedError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(InstallFailedError, ds.install, source=opj('sub', 'obscure'))

    # clean up:
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        result = get(path=opj("sub", "subsub"))
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [subsub])
def test_download(self, dataset):
    eval_config(dataset)
    authenticate(dataset)
    k_smallest = get_approx_ksmallests(dataset, get_filenames(dataset))

    # Restricted Zenodo datasets require downloading the whole archive before
    # individual files can be retrieved.
    project = project_name2env(dataset.split("/")[-1])
    if os.getenv(project + "_ZENODO_TOKEN", None):
        with timeout(300):
            api.get(path=dataset, on_failure="ignore")

    download_files(dataset, k_smallest)

    # Test the download of proper submodules.
    for submodule in get_proper_submodules(dataset):
        k_smallest = get_approx_ksmallests(submodule, get_filenames(submodule))
        download_files(submodule, k_smallest)
def test_autoresolve_multiple_datasets(src, path):
    with chpwd(path):
        ds1 = install(
            'ds1', source=src, result_xfm='datasets',
            return_type='item-or-list')
        ds2 = install(
            'ds2', source=src, result_xfm='datasets',
            return_type='item-or-list')
        results = get([opj('ds1', 'test-annex.dat')] +
                      glob(opj('ds2', '*.dat')))
        # each ds has one file
        assert_result_count(results, 2, type='file', action='get', status='ok')
        ok_(ds1.repo.file_has_content('test-annex.dat') is True)
        ok_(ds2.repo.file_has_content('test-annex.dat') is True)
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    subds.create()
    # get the submodule
    # This would init so there is a .git file with symlink info, which is
    # as we agreed is more pain than gain, so let's use our install which would
    # do it right, after all we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
def test_get_in_unavailable_subdataset(src, path):
    _make_dataset_hierarchy(src)
    root = install(
        path, source=src, result_xfm='datasets', return_type='item-or-list')
    targetpath = opj('sub1', 'sub2')
    targetabspath = opj(root.path, targetpath)
    res = get(targetabspath)
    assert_result_count(res, 2, status='ok', action='install', type='dataset')
    # dry-fit result filter that only returns the result that matched the
    # requested path
    filtered = [r for r in res if only_matching_paths(r, path=targetabspath)]
    assert_result_count(
        filtered, 1, status='ok', action='install', type='dataset',
        path=targetabspath)
    # we got the dataset, and its immediate content, but nothing below
    sub2 = Dataset(targetabspath)
    ok_(sub2.is_installed())
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not Dataset(opj(targetabspath, 'sub3')).is_installed())
def test_get_autoresolve_recurse_subdatasets(src, path):
    origin = Dataset(src).create()
    origin_sub = origin.create('sub')
    origin_subsub = origin_sub.create('subsub')
    with open(opj(origin_subsub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')
    origin.save(recursive=True, all_changes=True)

    ds = install(path, source=src)
    eq_(len(ds.get_subdatasets(fulfilled=True)), 0)
    results = get(opj(ds.path, 'sub'), recursive=True)
    eq_(len(ds.get_subdatasets(fulfilled=True, recursive=True)), 2)
    subsub = Dataset(opj(ds.path, 'sub', 'subsub'))
    ok_(subsub.is_installed())
    assert_in(subsub, results)
    # all file handles are fulfilled by default
    ok_(Dataset(opj(ds.path, 'sub', 'subsub')).repo.file_has_content(
        "file_in_annex.txt") is True)
def test_get_invalid_call(path, file_outside):
    # no argument at all:
    assert_raises(InsufficientArgumentsError, get, None)
    assert_raises(InsufficientArgumentsError, get, [])
    # invalid dataset:
    assert_status('impossible', get(None, dataset=path, on_failure='ignore'))

    # have a plain git:
    ds = Dataset(path)
    ds.create(no_annex=True)
    with open(opj(path, "some.txt"), "w") as f:
        f.write("whatever")
    ds.save("some.txt", to_git=True, message="Initial commit.")

    # make it an annex (remove indicator file that create has placed
    # in the dataset to make it possible):
    (ds.pathobj / '.noannex').unlink()
    AnnexRepo(path, init=True, create=True)
    # call get again on a file in git:
    result = ds.get("some.txt")
    assert_status('notneeded', result)

    # invalid source:
    # yoh: but now we would need to add it to annex since clever code first
    # checks what needs to be fetched at all
    create_tree(path, {'annexed.dat': 'some'})
    ds.save("annexed.dat")
    ds.repo.drop("annexed.dat", options=['--force'])
    with assert_raises(RemoteNotAvailableError) as ce:
        ds.get("annexed.dat", source='MysteriousRemote')
    eq_("MysteriousRemote", ce.exception.remote)

    res = ds.get("NotExistingFile.txt", on_failure='ignore')
    assert_status('impossible', res)
    assert_message("path does not exist", res)

    # path outside repo errors as with most other commands:
    res = ds.get(file_outside, on_failure='ignore')
    assert_in_results(
        res, status='impossible',
        message='path not associated with any dataset')