Example no. 1
def fetch_data(tmpdir, subject):
    """Fetches some test dicoms using datalad"""
    from datalad import api
    targetdir = op.join(tmpdir, 'QA')
    api.install(path=targetdir, source='http://datasets-tests.datalad.org/dbic/QA')
    api.get('{}/sourcedata/{}'.format(targetdir, subject))
    return targetdir
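The snippets in this collection share the same install-then-get pattern: datalad.api.install clones a dataset (metadata and file pointers only), and datalad.api.get retrieves the annexed file content. Below is a minimal, self-contained sketch of that pattern; the dataset URL and the relative path are placeholders, not taken from the example above.

# Minimal sketch of the install-then-get pattern (placeholder source and path).
import os.path as op
import tempfile

from datalad import api

def fetch_example(source_url, relpath):
    """Install a dataset into a temporary directory and fetch one path."""
    targetdir = op.join(tempfile.mkdtemp(), 'ds')
    # clone the dataset: metadata and file pointers only, no heavy content
    api.install(path=targetdir, source=source_url)
    # retrieve the actual (annexed) content for the requested path
    api.get(op.join(targetdir, relpath))
    return targetdir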
Example no. 2
def fetch_data(tmpdir, dicoms):
    """Fetches some test DICOMs using datalad"""
    data = os.path.join(tmpdir, 'data')
    api.install(path=data, source=DICOM_DIR)
    data = os.path.join(data, dicoms)
    api.get(path=data)
    return data
Example no. 3
def fetch_data(tmpdir, dataset, getpath=None):
    """
    Utility function to interface with datalad datasets.
    Performs datalad `install` and datalad `get` operations.

    Parameters
    ----------
    tmpdir : str
        directory to temporarily store data
    dataset : str
        dataset path from `http://datasets-tests.datalad.org`
    getpath : str [optional]
        specific path within the dataset to restrict `get` to

    Returns
    -------
    targetdir : str
        directory with installed dataset
    """
    from datalad import api
    targetdir = op.join(tmpdir, op.basename(dataset))
    api.install(path=targetdir,
                source='http://datasets-tests.datalad.org/{}'.format(dataset))

    getdir = targetdir + (op.sep + getpath if getpath is not None else '')
    api.get(getdir)
    return targetdir
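A hypothetical call to the helper above, based only on its docstring; the dataset name and getpath value are illustrative (dbic/QA and sourcedata are borrowed from other examples in this collection).

# Illustrative usage of fetch_data() as documented above.
targetdir = fetch_data('/tmp/work', 'dbic/QA', getpath='sourcedata')
print(targetdir)  # directory with the installed dataset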
Example no. 4
def test_install_crcns(tdir, ds_path):
    with chpwd(tdir):
        with swallow_logs(new_level=logging.INFO) as cml:
            install("all-nonrecursive", source='///')
            # log decorations such as the log level are not captured while
            # swallowing, so check that neither an 'ERROR' marker nor an
            # exit-code message appears in the output; test both
            assert_not_in('ERROR', cml.out)
            # below one must not fail alone! ;)
            assert_not_in('with exit code', cml.out)

        # should not hang in infinite recursion
        with chpwd('all-nonrecursive'):
            get("crcns")
        ok_(exists(_path_("all-nonrecursive/crcns/.git/config")))
        # and we could repeat installation and get the same result
        ds1 = install(_path_("all-nonrecursive/crcns"))
        ds2 = Dataset('all-nonrecursive').install('crcns')
        ok_(ds1.is_installed())
        eq_(ds1, ds2)
        eq_(ds1.path, ds2.path)  # to make sure they are a single dataset

    # again, but into existing dataset:
    ds = create(ds_path)
    crcns = ds.install("///crcns")
    ok_(crcns.is_installed())
    eq_(crcns.path, opj(ds_path, "crcns"))
    assert_in(crcns.path, ds.get_subdatasets(absolute=True))
Example no. 5
def fetch_data(tmpdir, subject):
    """Fetches some test dicoms using datalad"""
    from datalad import api
    targetdir = op.join(tmpdir, 'QA')
    api.install(path=targetdir, source='///dbic/QA')
    api.get('{}/sourcedata/{}'.format(targetdir, subject))
    return targetdir
Example no. 6
def test_real_data(infile):
    dl.get(infile)
    data = np.recfromcsv(infile,
                         delimiter='\t',
                         names=['x', 'y', 'pupil', 'frame'])

    clf = d.EyegazeClassifier(
        #px2deg=0.0185581232561,
        px2deg=0.0266711972026,
        sampling_rate=1000.0)
    p = clf.preproc(data)

    events = clf(p[:50000],
                 #p,
                 )

    evdf = ut.events2df(events)

    labels = list(evdf['label'])
    # find all kinds of events
    for t in ('FIXA', 'PURS', 'SACC', 'LPSO', 'HPSO', 'ISAC', 'IHPS'):
        # 'ILPS' is excluded because one file doesn't have any
        assert t in labels
    return  # skip the manual-inspection plotting below in automated runs

    ut.show_gaze(pp=p[:50000], events=events)
    #ut.show_gaze(pp=p, events=events)
    import pylab as pl
    saccades = evdf[evdf['label'] == 'SACC']
    isaccades = evdf[evdf['label'] == 'ISAC']
    print('#saccades', len(saccades), len(isaccades))
    pl.plot(saccades['amp'], saccades['peak_vel'], '.', alpha=.3)
    pl.plot(isaccades['amp'], isaccades['peak_vel'], '.', alpha=.3)
    pl.show()
Example no. 7
    def check_comparison_dir(self):
        cmpr_path = self.data.comparison_dir
        if not cmpr_path.exists():
            raise ValueError(
                "The following path does not exist but is required to "
                f"perform a test:{cmpr_path}.\n You may wish to run the "
                "test with the --create_sample_output flag or generate "
                "output for future test sessions with "
                "--save_sample_output. ")
        cmpr_files = list(cmpr_path.glob("**/*"))
        cmpr_files_rel = [f.relative_to(cmpr_path) for f in cmpr_files]

        files_required = [
            f.relative_to(self.data.outdir) for f in self.file_list
        ]
        missing_files = []
        for f in files_required:
            if f not in cmpr_files_rel:
                missing_files.append(str(cmpr_path / f))
        if missing_files:
            m_str = " ".join(missing_files)
            raise ValueError(
                "The following files are missing and are required to "
                f"fully complete the test: {m_str} ")

        need_data = any(p.is_symlink() and not p.exists() for p in cmpr_files)
        if need_data:
            datalad.get(str(cmpr_path))
Example no. 8
def fetch_data(tmpdir, dicoms):
    """Fetches some test DICOMs using datalad"""
    data = os.path.join(tmpdir, 'data')
    api.install(path=data, source=DICOM_DIR)
    data = os.path.join(data, dicoms)
    api.get(path=data)
    return data
Example no. 9
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    # We must not be able to create a new repository under a known
    # subdataset path.
    # Note: Unfortunately we would still be able to generate it under a
    # subdirectory within the submodule, e.g. `subm 1/subdir`, but that is
    # not checked here. `rev-create` will provide that protection
    # when create/rev-create merge.
    with assert_raises(PathKnownToRepositoryError):
        subds.create()
    # get the submodule
    # `git submodule update --init` would leave a .git file with symlink info,
    # which we agreed is more pain than gain, so use our own install, which
    # does it right; after all, we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
Example no. 10
def test_install_crcns(tdir, ds_path):
    with chpwd(tdir):
        with swallow_logs(new_level=logging.INFO) as cml:
            install("all-nonrecursive", source='///')
            # log decorations such as the log level are not captured while
            # swallowing, so check that neither an 'ERROR' marker nor an
            # exit-code message appears in the output; test both
            assert_not_in('ERROR', cml.out)
            # below one must not fail alone! ;)
            assert_not_in('with exit code', cml.out)

        # should not hang in infinite recursion
        with chpwd('all-nonrecursive'):
            get("crcns")
        ok_(exists(_path_("all-nonrecursive/crcns/.git/config")))
        # and we could repeat installation and get the same result
        ds1 = install(_path_("all-nonrecursive/crcns"))
        ds2 = Dataset('all-nonrecursive').install('crcns')
        ok_(ds1.is_installed())
        eq_(ds1, ds2)
        eq_(ds1.path, ds2.path)  # to make sure they are a single dataset

    # again, but into existing dataset:
    ds = create(ds_path)
    crcns = ds.install("///crcns")
    ok_(crcns.is_installed())
    eq_(crcns.path, opj(ds_path, "crcns"))
    assert_in(crcns.path, ds.get_subdatasets(absolute=True))
Example no. 11
def savegaze():
    """
    small function to generate and save remodnav classification figures
    """
    from remodnav.tests import utils as ut
    import pylab as pl
    import datalad.api as dl

    # use two exemplary files (lab + MRI) that are also used during testing
    # hardcoding those, as I see no reason for updating them
    infiles = [
        op.join(
            'data',
            'raw_eyegaze', 'sub-32', 'beh',
            'sub-32_task-movie_run-5_recording-eyegaze_physio.tsv.gz'),
        op.join(
            'data',
            'raw_eyegaze', 'sub-02', 'ses-movie',  'func',
            'sub-02_ses-movie_task-movie_run-5_recording-eyegaze_physio.tsv.gz'
        ),
    ]
    # one call per file due to https://github.com/datalad/datalad/issues/3356
    for f in infiles:
        dl.get(f)
    for f in infiles:
        # read data
        data = np.recfromcsv(f,
                             delimiter='\t',
                             names=['x', 'y', 'pupil', 'frame'])

        # adjust px2deg conversion factor according to datafile
        pxdeg, ext = (0.0266711972026, 'lab') if '32' in f \
            else (0.0185581232561, 'mri')
        clf = EyegazeClassifier(
            px2deg=pxdeg,
            sampling_rate=1000.0)
        p = clf.preproc(data)
        # let's go with 10 seconds to actually see details. This particular time
        # window is within the originally plotted 50s and contains missing data
        # for both data types (lab & mri)
        events = clf(p[15000:25000])

        fig = pl.figure(
            # fake size to get the font size down in relation
            figsize=(14, 2),
            dpi=120,
            frameon=False)
        ut.show_gaze(
            pp=p[15000:25000],
            events=events,
            sampling_rate=1000.0,
            show_vels=True,
            coord_lim=(0, 1280),
            vel_lim=(0, 1000))
        pl.savefig(
            op.join('img', 'remodnav_{}.svg'.format(ext)),
            transparent=True,
            bbox_inches="tight")
        pl.close()
Example no. 12
def plot_dist(figures):
    """
    Plot the events duration distribution per movie run, per data set.
    """
    import pandas as pd

    # do nothing if we don't want to plot
    if not figures:
        return

    import datalad.api as dl
    dl.install(op.join('data', 'studyforrest-data-eyemovementlabels'))
    datapath = op.join('data',
                       'studyforrest-data-eyemovementlabels',
                       'sub*',
                       '*.tsv')

    data = sorted(glob(datapath))
    dl.get(dataset='.', path=data)

    for ds, ds_name in [(mri_ids, 'mri'), (lab_ids, 'lab')]:
        dfs = [
            pd.read_csv(f, header=0, delim_whitespace=True)
            for f in data
            if any('sub-{}'.format(i) in f for i in ds)
        ]
        df = pd.concat(dfs)
        # that's a concatenated dataframe with all files from one dataset (lab or mri)
        # extract relevant event types
        SACs = df[(df.label == 'SACC') | (df.label == 'ISACS')]
        FIX = df[df.label == 'FIXA']
        PSOs = df[(df.label == 'HPSO') | (df.label == 'IHPS') | (df.label == 'LPSO') | (df.label == 'ILPS')]
        PURs = df[df.label == 'PURS']
        # plot a histogram. Set the same x-axis limits as NH for fixations and saccades,
        # and exclude outlying 0.5% for other events
        for (ev_df, label, x_lim, y_lim) in [
                (SACs, 'saccade', (0, 0.16), (1, 62000)),
                (FIX, 'fixation', (0, 1.0), (1, 50000)),
                (PSOs, 'PSO', (0, 0.04), (1, 26000)),
                (PURs, 'pursuit', (0, .8), (1, 30000))]:
            fig = pl.figure(figsize=(3,2))
            pl.hist(ev_df['duration'].values,
                    bins='doane',
                    range=x_lim,
                    color='gray')
                    #log=True)
            pl.xlabel('{} duration in s'.format(label))
            pl.xlim(x_lim)
            pl.ylim(y_lim)
            pl.savefig(
                op.join(
                    'img',
                    'hist_{}_{}.svg'.format(
                        label,
                        ds_name)),
                transparent=True,
                bbox_inches="tight")
            pl.close()
Example no. 13
def test_cmdline(infile, tmpdir):
    import remodnav
    dl.get(infile)
    outfname = tmpdir.mkdir('bids').join("events.tsv").strpath

    remodnav.main(['fake', infile, outfname, '0.0266711972026', '1000'])

    assert op.exists(outfname)
    assert op.exists(outfname[:-4] + '.png')
Example no. 14
def _fetch_data(datadir, dicoms):
    """Fetches some test DICOMs using datalad"""
    try:
        api.install(path=datadir, source=DICOM_DIR)
        data = os.path.join(datadir, dicoms)
        api.get(path=data)
    except IncompleteResultsError as exc:
        pytest.skip("Failed to fetch test data: %s" % str(exc))
    return data
Example no. 15
def main():
    """Entry point"""
    thispath = os.getcwd()
    opts = get_parser().parse_args()
    np.random.seed(opts.seed)

    out_file = None
    if opts.output_file is not None:
        out_file = os.path.abspath(opts.output_file)

    os.chdir(opts.openfmri_dir)
    all_sub = sorted(glob.glob('ds*/sub-*'))
    datasets = {}
    multises = set()
    for subj in all_sub:
        ds = subj.split('/')[0]
        if os.path.isdir(os.path.join(subj, 'anat')) and os.path.isdir(os.path.join(subj, 'func')):
            datasets.setdefault(ds, []).append(os.path.basename(subj))
        elif (glob.glob(os.path.join(subj, 'ses-*', 'anat')) and
              glob.glob(os.path.join(subj, 'ses-*', 'func'))):
            multises.add(ds)
            datasets.setdefault(ds, []).append(os.path.basename(subj))

    subsample = {}

    n_sample = 0
    for ds, sublist in datasets.items():
        n_sample += min(opts.num_participants, len(sublist))
        if len(sublist) <= opts.num_participants:
            subsample[ds] = sublist
        else:
            subsample[ds] = sorted(np.random.choice(
                sublist, size=opts.num_participants, replace=False).tolist())

    # Double-check everything looks good: n_sample must match the subsample size
    assert n_sample == len([sub for _, sublist in subsample.items() for sub in sublist])

    if out_file is not None:
        import yaml
        with open(out_file, 'w') as outfh:
            outfh.write(yaml.dump(subsample))
        print('Sampled participants stored to %s' % out_file)

    singleses = set(datasets.keys()) - multises
    print('Sampled %d participants' % n_sample)
    print('Datasets summary:\n\tSingle-session=%d'
          '\n\tMulti-session=%d'
          '\n\tTotal participants=%d' % (len(singleses), len(multises), n_sample))
    os.chdir(thispath)

    if opts.datalad_fetch:
        import datalad.api as dlad
        for ds, sublist in subsample.items():
            for sub in sublist:
                dlad.get(path=os.path.join(opts.openfmri_dir, ds, sub),
                         recursive=True, jobs=opts.njobs, verbose=True)
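The final loop above fetches each sampled participant recursively and in parallel. A minimal sketch of that call in isolation; the dataset path and job count are placeholders.

# Minimal sketch of a recursive, parallel fetch (placeholder path and jobs).
import datalad.api as dlad

dlad.get(path='/data/openfmri/ds000001/sub-01', recursive=True, jobs=4)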
Example no. 16
def test_get_in_unavailable_subdataset(src, path):
    origin_ds = _make_dataset_hierarchy(src)
    root = install(path, source=src)
    targetpath = opj('sub1', 'sub2')
    targetabspath = opj(root.path, targetpath)
    get(targetabspath)
    # we got the dataset, and its immediate content, but nothing below
    sub2 = Dataset(targetabspath)
    ok_(sub2.is_installed())
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not Dataset(opj(targetabspath, 'sub3')).is_installed())
Example no. 17
def test_get_in_unavailable_subdataset(src, path):
    origin_ds = _make_dataset_hierarchy(src)
    root = install(path, source=src)
    targetpath = opj('sub1', 'sub2')
    targetabspath = opj(root.path, targetpath)
    get(targetabspath)
    # we got the dataset, and its immediate content, but nothing below
    sub2 = Dataset(targetabspath)
    ok_(sub2.is_installed())
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not Dataset(opj(targetabspath, 'sub3')).is_installed())
Example no. 18
def read_data(data):
    """
    Get and read in data.
    """
    # if the data is not retrieved, get it using datalad get.
    dl.get(data)
    # read data into a pandas dataframe
    df = pd.read_csv(data)
    attributes = [
        "sepal_length", "sepal_width", "petal_length", "petal_width", "class"
    ]
    df.columns = attributes
    return df
Example no. 19
def _datalad_get(filepath):
    if not filepath:
        return

    from datalad import api
    from datalad.support.exceptions import IncompleteResultsError

    try:
        api.get(str(filepath))
    except IncompleteResultsError as exc:
        if exc.failed[0]['message'] == 'path not associated with any dataset':
            from .conf import TF_GITHUB_SOURCE
            api.install(path=TF_LAYOUT.root, source=TF_GITHUB_SOURCE, recursive=True)
            api.get(str(filepath))
        else:
            raise
Example no. 20
def test_install_known_subdataset(src, path):

    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that the correct submodule was installed and not a
    # new repository initialized
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
Example no. 21
def download_files(dataset, filenames, time_limit=120):
    if len(filenames) == 0:
        return

    responses = []
    with timeout(time_limit):
        for filename in filenames:
            full_path = os.path.join(dataset, filename)
            responses = api.get(path=full_path, on_failure="ignore")

            for response in responses:
                if response.get("status") in ["ok", "notneeded"]:
                    continue
                if response.get("status") in ["impossible", "error"]:
                    pytest.fail(
                        f"{full_path}\n{response.get('message')}", pytrace=False
                    )

    if not responses:
        pytest.fail(
            f"The dataset timed out after {time_limit} seconds before retrieving a file."
            " Cannot to tell if the download would be sucessful."
            f"\n{filename} has size of {humanize.naturalsize(get_annexed_file_size(dataset, filename))}.",
            pytrace=False,
        )
Example no. 22
def get_dataset_data(ds, path_to_get, verbose=False, parallelized=None):
    """
    Gets data from dataset (ds) using datalad.api.get()
    Returns a list of file paths to the files that would have been
    downloaded by this command, even if they already existed in the filesystem

    Throws exception if a bad status is returned from datalad.api.get()

    parallelized is either 'None' or an integer describing the number of jobs to
    use (passed directly to datalad.api.get)

    If verbose is True, datalad.api.get will print out the list of files it is
    downloading in json pretty-print
    """
    get_kwargs = {'path': path_to_get, 'dataset': ds, 'jobs': parallelized}
    if verbose:
        get_kwargs['result_renderer'] = 'json_pp'
    specific_data = api.get(**get_kwargs)
    file_paths = []
    for data_file_response in specific_data:
        assert data_file_response[
            'status'] == 'ok', "Requires an 'ok' status, received %s" % (
                data_file_response['status'])
        if data_file_response['type'] == 'file':
            file_paths.append(data_file_response['path'])
    return file_paths
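A hypothetical call to the helper above, following its docstring; the dataset path, target path, and job count are placeholders.

# Illustrative usage of get_dataset_data() as documented above.
file_paths = get_dataset_data(ds='/data/my-dataset', path_to_get='sub-01',
                              verbose=True, parallelized=4)
print('\n'.join(file_paths))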
Example no. 23
def recurse(directory, odds):
    """
    recurse recursively checks each file in directory and sub-directories with odds chance.
    Odds is a positive decimal that dictates how likely a file is to be tested from 0 (no chance) to 1 or
    above (100% chance). This function tests for if files can be retrieved with datalad and if they can't,
    if there is an authentication setup for security.
    """

    # Get all file names in directory
    files = listdir(directory)

    # Loop through every file
    for file_name in files:

        # If the file name is .git or .datalad, ignore
        if file_name == ".git" or file_name == ".datalad":
            continue

        full_path = join(directory, file_name)

        # If the file is a directory, descend into it; propagate a failure,
        # otherwise keep scanning the remaining entries
        if isdir(full_path):
            result = recurse(full_path, odds)
            if result != "All good":
                return result

        # If the file is a broken symlink, test it with probability `odds`
        elif not exists(full_path) and random() < odds:
            msg = api.get(path=full_path, on_failure="ignore", return_type="item-or-list")

            # Check for authentication
            if msg["status"] == "error" and "unable to access" not in msg["message"].lower():
                return "Cannot download file and didn't hit authentication request for file: " + full_path

    return "All good"
Example no. 24
def test_gh3356(src, path):
    # create toy version of gh-3356 scenario
    origin = Dataset(src).create()
    origin_sub = origin.create(origin.pathobj / 'subdir' / 'subds')
    for p in ((origin_sub.pathobj / 'data' / 'file_in_annex.txt'),
              (origin_sub.pathobj / 'data' / 'file_in_annex2.txt')):
        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_text(p.name)
    origin.save(recursive=True)
    clone = install(path,
                    source=src,
                    result_xfm='datasets',
                    return_type='item-or-list')
    targetpaths = [
        opj('subdir', 'subds', 'data', 'file_in_annex.txt'),
        opj('subdir', 'subds', 'data', 'file_in_annex2.txt'),
    ]
    with chpwd(path):
        res = get(targetpaths)
    # get() must report success on two files
    assert_result_count(res, 2, action='get', type='file', status='ok')
    # status must report content for two files
    assert_result_count(clone.status(recursive=True,
                                     annex='all',
                                     report_filetype='eval'),
                        2,
                        action='status',
                        has_content=True)
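This test exercises passing a list of paths to a single get() call, the behaviour whose absence example no. 11 works around (one dl.get call per file, per datalad issue 3356). A minimal sketch of the list form, using placeholder paths.

# Illustrative only: fetch several files with one get() call by passing a
# list of paths (relative paths are resolved against the current directory,
# which must be inside the installed dataset).
import datalad.api as dl

dl.get(['subdir/subds/data/file_in_annex.txt',
        'subdir/subds/data/file_in_annex2.txt'])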
Example no. 25
def test_install_known_subdataset(src=None, path=None):

    _mk_submodule_annex(src, fname="test-annex.dat", fcontent="whatever")

    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(state='absent', result_xfm='relpaths'))
    assert_not_in('subm 1',
                  ds.subdatasets(state='present', result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that the correct submodule was installed and not a
    # new repository initialized
    assert_in("test-annex.dat", subds.repo.get_indexed_files()),
    assert_not_in('subm 1',
                  ds.subdatasets(state='absent', result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(state='present', result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
Example no. 26
def test_get_autoresolve_recurse_subdatasets(src, path):

    origin = Dataset(src).create()
    origin_sub = origin.create('sub')
    origin_subsub = origin_sub.create('subsub')
    with open(opj(origin_subsub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')
    origin.save(recursive=True)

    ds = install(path,
                 source=src,
                 result_xfm='datasets',
                 return_type='item-or-list')
    eq_(len(ds.subdatasets(fulfilled=True)), 0)

    with chpwd(ds.path):
        results = get(opj(ds.path, 'sub'),
                      recursive=True,
                      result_xfm='datasets')
    eq_(len(ds.subdatasets(fulfilled=True, recursive=True)), 2)
    subsub = Dataset(opj(ds.path, 'sub', 'subsub'))
    ok_(subsub.is_installed())
    assert_in(subsub, results)
    # all file handles are fulfilled by default
    ok_(
        Dataset(opj(ds.path, 'sub', 'subsub')).repo.file_has_content(
            "file_in_annex.txt") is True)
Example no. 27
def test_install_recursive_repeat(src, path):
    subsub_src = Dataset(opj(src, 'sub 1', 'subsub')).create(force=True)
    sub1_src = Dataset(opj(src, 'sub 1')).create(force=True)
    sub2_src = Dataset(opj(src, 'sub 2')).create(force=True)
    top_src = Dataset(src).create(force=True)
    top_src.save(auto_add_changes=True, recursive=True)

    # install top level:
    top_ds = install(path, source=src)
    ok_(top_ds.is_installed() is True)
    sub1 = Dataset(opj(path, 'sub 1'))
    ok_(sub1.is_installed() is False)
    sub2 = Dataset(opj(path, 'sub 2'))
    ok_(sub2.is_installed() is False)
    subsub = Dataset(opj(path, 'sub 1', 'subsub'))
    ok_(subsub.is_installed() is False)

    # install again, now with data and recursive, but recursion_limit 1:
    result = get(os.curdir, dataset=path, recursive=True, recursion_limit=1)
    # top-level dataset was not reobtained
    assert_not_in(top_ds, result)
    assert_in(sub1, result)
    assert_in(sub2, result)
    assert_not_in(subsub, result)
    ok_(top_ds.repo.file_has_content('top_file.txt') is True)
    ok_(sub1.repo.file_has_content('sub1file.txt') is True)
    ok_(sub2.repo.file_has_content('sub2file.txt') is True)

    # install sub1 again, recursively and with data
    top_ds.install('sub 1', recursive=True, get_data=True)
    ok_(subsub.is_installed())
    ok_(subsub.repo.file_has_content('subsubfile.txt'))
Example no. 28
def download_files(dataset, dataset_size, *, num=4):
    filenames, contains_archived_files = get_filenames(dataset, minimum=num)
    k_smallest = get_approx_ksmallests(dataset, filenames)

    if len(k_smallest) == 0:
        return

    download_size = (dataset_size if contains_archived_files else
                     get_sample_files_size(dataset, k_smallest))
    # Set a time limit based on the download size.
    # Limit between 20 sec and 10 minutes to avoid test to fail/hang.
    time_limit = int(max(20, min(download_size * 1.2 // 2e6, 600)))

    responses = []
    with timeout(time_limit):
        for filename in k_smallest:
            full_path = os.path.join(dataset, filename)
            responses = api.get(path=full_path, on_failure="ignore")

            for response in responses:
                if response.get("status") in ["ok", "notneeded"]:
                    continue
                if response.get("status") in ["impossible", "error"]:
                    pytest.fail(
                        f"{full_path}\n{response.get('message')}",
                        pytrace=False,
                    )

    if not responses:
        pytest.fail(
            f"The dataset timed out after {time_limit} seconds before retrieving a file."
            " Cannot to tell if the download would be sucessful."
            f"\n{filename} has size of {humanize.naturalsize(get_annexed_file_size(dataset, filename))}.",
            pytrace=False,
        )
Example no. 29
def test_get_in_unavailable_subdataset(src, path):
    _make_dataset_hierarchy(src)
    root = install(path,
                   source=src,
                   result_xfm='datasets',
                   return_type='item-or-list')
    targetpath = opj('sub1', 'sub2')
    targetabspath = opj(root.path, targetpath)
    with chpwd(path):
        res = get(targetabspath)
    assert_result_count(res, 2, status='ok', action='install', type='dataset')
    # dry-fit result filter that only returns the result that matched the requested
    # path
    filtered = [r for r in res if only_matching_paths(r, path=targetabspath)]
    assert_result_count(filtered,
                        1,
                        status='ok',
                        action='install',
                        type='dataset',
                        path=targetabspath)
    # we got the dataset, and its immediate content, but nothing below
    sub2 = Dataset(targetabspath)
    ok_(sub2.is_installed())
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not Dataset(opj(targetabspath, 'sub3')).is_installed())
Example no. 30
def test_install_known_subdataset(src, path):

    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1',
                  ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that the correct submodule was installed and not a
    # new repository initialized
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1',
                  ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
Example no. 31
def test_install_recursive_repeat(src, path):
    subsub_src = Dataset(opj(src, 'sub 1', 'subsub')).create(force=True)
    sub1_src = Dataset(opj(src, 'sub 1')).create(force=True)
    sub2_src = Dataset(opj(src, 'sub 2')).create(force=True)
    top_src = Dataset(src).create(force=True)
    top_src.add('.', recursive=True)
    ok_clean_git(top_src.path)

    # install top level:
    top_ds = install(path, source=src)
    ok_(top_ds.is_installed() is True)
    sub1 = Dataset(opj(path, 'sub 1'))
    ok_(sub1.is_installed() is False)
    sub2 = Dataset(opj(path, 'sub 2'))
    ok_(sub2.is_installed() is False)
    subsub = Dataset(opj(path, 'sub 1', 'subsub'))
    ok_(subsub.is_installed() is False)

    # install again, now with data and recursive, but recursion_limit 1:
    result = get(os.curdir, dataset=path, recursive=True, recursion_limit=1,
                 result_xfm='datasets')
    # top-level dataset was not reobtained
    assert_not_in(top_ds, result)
    assert_in(sub1, result)
    assert_in(sub2, result)
    assert_not_in(subsub, result)
    ok_(top_ds.repo.file_has_content('top_file.txt') is True)
    ok_(sub1.repo.file_has_content('sub1file.txt') is True)
    ok_(sub2.repo.file_has_content('sub2file.txt') is True)

    # install sub1 again, recursively and with data
    top_ds.install('sub 1', recursive=True, get_data=True)
    ok_(subsub.is_installed())
    ok_(subsub.repo.file_has_content('subsubfile.txt'))
Example no. 32
def test_implicit_install(src, dst):

    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.save("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.save("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.save("file3.txt")
    origin_top.save(recursive=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(IncompleteResultsError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    # but by default implicit results are not reported
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(IncompleteResultsError, ds.install, source=opj('sub', 'obscure'))

    # clean up, the nasty way
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        # don't ask for the file content to make return value comparison
        # simpler
        result = get(path=opj("sub", "subsub"), get_data=False, result_xfm='datasets')
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [sub, subsub])
Example no. 33
def test_implicit_install(src, dst):

    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.add("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.add("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.add("file3.txt")
    origin_top.save(recursive=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(IncompleteResultsError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    # but by default implicit results are not reported
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(IncompleteResultsError, ds.install, source=opj('sub', 'obscure'))

    # clean up, the nasty way
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        # don't ask for the file content to make return value comparison
        # simpler
        result = get(path=opj("sub", "subsub"), get_data=False, result_xfm='datasets')
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [sub, subsub])
Example no. 34
    def test_download(self, dataset):
        eval_config(dataset)
        authenticate(dataset)

        filenames = get_filenames(dataset)
        if len(filenames) == 0:
            return True

        k_smallest = get_approx_ksmallests(dataset, filenames)

        # Restricted Zenodo datasets require downloading the whole archive
        # before downloading individual files.
        project = project_name2env(dataset.split("/")[-1])
        if os.getenv(project + "_ZENODO_TOKEN", None):
            with timeout(300):
                api.get(path=dataset, on_failure="ignore")
        download_files(dataset, k_smallest)
Example no. 35
def test_autoresolve_multiple_datasets(src, path):
    with chpwd(path):
        ds1 = install('ds1', source=src)
        ds2 = install('ds2', source=src)
        results = get([opj('ds1', 'test-annex.dat')] + glob(opj('ds2', '*.dat')))
        # each ds has one file
        eq_(len(results), 2)
        ok_(ds1.repo.file_has_content('test-annex.dat') is True)
        ok_(ds2.repo.file_has_content('test-annex.dat') is True)
Example no. 36
def test_autoresolve_multiple_datasets(src, path):
    with chpwd(path):
        ds1 = install('ds1', source=src)
        ds2 = install('ds2', source=src)
        results = get([opj('ds1', 'test-annex.dat')] + glob(opj('ds2', '*.dat')))
        # each ds has one file
        eq_(len(results), 2)
        ok_(ds1.repo.file_has_content('test-annex.dat') is True)
        ok_(ds2.repo.file_has_content('test-annex.dat') is True)
Example no. 37
def test_implicit_install(src, dst):

    origin_top = create(src)
    origin_sub = origin_top.create("sub")
    origin_subsub = origin_sub.create("subsub")
    with open(opj(origin_top.path, "file1.txt"), "w") as f:
        f.write("content1")
    origin_top.add("file1.txt")
    with open(opj(origin_sub.path, "file2.txt"), "w") as f:
        f.write("content2")
    origin_sub.add("file2.txt")
    with open(opj(origin_subsub.path, "file3.txt"), "w") as f:
        f.write("content3")
    origin_subsub.add("file3.txt")
    origin_top.save(auto_add_changes=True)

    # first, install toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())

    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # fail on obscure non-existing one
    assert_raises(InstallFailedError, ds.install, source='obscure')

    # install 3rd level and therefore implicitly the 2nd:
    result = ds.install(path=opj("sub", "subsub"))
    ok_(sub.is_installed())
    ok_(subsub.is_installed())
    eq_(result, subsub)

    # fail on obscure non-existing one in subds
    assert_raises(InstallFailedError, ds.install, source=opj('sub', 'obscure'))

    # clean up:
    rmtree(dst, chmod_files=True)
    ok_(not exists(dst))

    # again first toplevel:
    ds = install(dst, source=src)
    ok_(ds.is_installed())
    sub = Dataset(opj(ds.path, "sub"))
    ok_(not sub.is_installed())
    subsub = Dataset(opj(sub.path, "subsub"))
    ok_(not subsub.is_installed())

    # now implicit but without an explicit dataset to install into
    # (deriving from CWD):
    with chpwd(dst):
        result = get(path=opj("sub", "subsub"))
        ok_(sub.is_installed())
        ok_(subsub.is_installed())
        eq_(result, [subsub])
Example no. 38
    def test_download(self, dataset):
        eval_config(dataset)
        authenticate(dataset)

        k_smallest = get_approx_ksmallests(dataset, get_filenames(dataset))

        # Restricted Zenodo datasets require downloading the whole archive
        # before downloading individual files.
        project = project_name2env(dataset.split("/")[-1])
        if os.getenv(project + "_ZENODO_TOKEN", None):
            with timeout(300):
                api.get(path=dataset, on_failure="ignore")
        download_files(dataset, k_smallest)

        # Test the download of proper submodules.
        for submodule in get_proper_submodules(dataset):
            k_smallest = get_approx_ksmallests(submodule,
                                               get_filenames(submodule))
            download_files(submodule, k_smallest)
Example no. 39
def test_autoresolve_multiple_datasets(src, path):
    with chpwd(path):
        ds1 = install(
            'ds1', source=src,
            result_xfm='datasets', return_type='item-or-list')
        ds2 = install(
            'ds2', source=src,
            result_xfm='datasets', return_type='item-or-list')
        results = get([opj('ds1', 'test-annex.dat')] + glob(opj('ds2', '*.dat')))
        # each ds has one file
        assert_result_count(results, 2, type='file', action='get', status='ok')
        ok_(ds1.repo.file_has_content('test-annex.dat') is True)
        ok_(ds2.repo.file_has_content('test-annex.dat') is True)
Example no. 40
def test_is_installed(src, path):
    ds = Dataset(path)
    assert_false(ds.is_installed())

    # get a clone:
    AnnexRepo.clone(src, path)
    ok_(ds.is_installed())
    # submodule still not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    subds.create()
    # get the submodule
    # `git submodule update --init` would leave a .git file with symlink info,
    # which we agreed is more pain than gain, so use our own install, which
    # does it right; after all, we are checking 'is_installed' ;)
    # from datalad.cmd import Runner
    # Runner().run(['git', 'submodule', 'update', '--init', 'subm 1'], cwd=path)
    with chpwd(path):
        get('subm 1')
    ok_(subds.is_installed())
    # wipe it out
    rmtree(ds.path)
    assert_false(ds.is_installed())
Example no. 41
def test_get_in_unavailable_subdataset(src, path):
    _make_dataset_hierarchy(src)
    root = install(
        path, source=src,
        result_xfm='datasets', return_type='item-or-list')
    targetpath = opj('sub1', 'sub2')
    targetabspath = opj(root.path, targetpath)
    res = get(targetabspath)
    assert_result_count(res, 2, status='ok', action='install', type='dataset')
    # dry-fit result filter that only returns the result that matched the requested
    # path
    filtered = [r for r in res if only_matching_paths(r, path=targetabspath)]
    assert_result_count(
        filtered, 1, status='ok', action='install', type='dataset',
        path=targetabspath)
    # we got the dataset, and its immediate content, but nothing below
    sub2 = Dataset(targetabspath)
    ok_(sub2.is_installed())
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not Dataset(opj(targetabspath, 'sub3')).is_installed())
Example no. 42
def test_get_autoresolve_recurse_subdatasets(src, path):

    origin = Dataset(src).create()
    origin_sub = origin.create('sub')
    origin_subsub = origin_sub.create('subsub')
    with open(opj(origin_subsub.path, 'file_in_annex.txt'), "w") as f:
        f.write('content')
    origin.save(recursive=True, all_changes=True)

    ds = install(path, source=src)
    eq_(len(ds.get_subdatasets(fulfilled=True)), 0)

    results = get(opj(ds.path, 'sub'), recursive=True)
    eq_(len(ds.get_subdatasets(fulfilled=True, recursive=True)), 2)
    subsub = Dataset(opj(ds.path, 'sub', 'subsub'))
    ok_(subsub.is_installed())
    assert_in(subsub, results)
    # all file handles are fulfilled by default
    ok_(Dataset(opj(ds.path, 'sub', 'subsub')).repo.file_has_content(
        "file_in_annex.txt") is True)
Example no. 43
def test_get_invalid_call(path, file_outside):

    # no argument at all:
    assert_raises(InsufficientArgumentsError, get, None)
    assert_raises(InsufficientArgumentsError, get, [])
    # invalid dataset:
    assert_status('impossible', get(None, dataset=path, on_failure='ignore'))

    # have a plain git:
    ds = Dataset(path)
    ds.create(no_annex=True)
    with open(opj(path, "some.txt"), "w") as f:
        f.write("whatever")
    ds.save("some.txt", to_git=True, message="Initial commit.")

    # make it an annex (remove indicator file that create has placed
    # in the dataset to make it possible):
    (ds.pathobj / '.noannex').unlink()
    AnnexRepo(path, init=True, create=True)
    # call get again on a file in git:
    result = ds.get("some.txt")
    assert_status('notneeded', result)

    # invalid source:
    # yoh:  but now we would need to add it to annex since clever code first
    # checks what needs to be fetched at all
    create_tree(path, {'annexed.dat': 'some'})
    ds.save("annexed.dat")
    ds.repo.drop("annexed.dat", options=['--force'])
    with assert_raises(RemoteNotAvailableError) as ce:
        ds.get("annexed.dat", source='MysteriousRemote')
    eq_("MysteriousRemote", ce.exception.remote)

    res = ds.get("NotExistingFile.txt", on_failure='ignore')
    assert_status('impossible', res)
    assert_message("path does not exist", res)

    # path outside repo errors as with most other commands:
    res = ds.get(file_outside, on_failure='ignore')
    assert_in_results(
        res, status='impossible', message='path not associated with any dataset')