Example #1
def test_plugin_config(path):
    # baseline behavior, empty datasets on create
    ds = create(dataset=opj(path, 'ds1'))
    eq_(sorted(os.listdir(ds.path)), ['.datalad', '.git', '.gitattributes'])
    # now we configure a plugin to run twice after `create`
    cfg.add('datalad.create.run-after',
            'add_readme filename=after1.txt',
            where='global')
    cfg.add('datalad.create.run-after',
            'add_readme filename=after2.txt',
            where='global')
    # force reload to pick up newly populated .gitconfig
    cfg.reload(force=True)
    assert_in('datalad.create.run-after', cfg)
    # and now we create a dataset and expect the two readme files
    # to be part of it
    ds = create(dataset=opj(path, 'ds'))
    ok_clean_git(ds.path)
    assert exists(opj(ds.path, 'after1.txt'))
    assert exists(opj(ds.path, 'after2.txt'))
    # cleanup
    cfg.unset(
        'datalad.create.run-after',
        where='global')
    assert_not_in('datalad.create.run-after', cfg)
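The test above exercises DataLad's global ConfigManager (the module-level `cfg`), whose `where='global'` scope is backed by `~/.gitconfig`. Below is a minimal sketch of the same add/reload/check/unset round trip in isolation, assuming the ConfigManager API as used in the test; the `try`/`finally` makes sure the global entry cannot outlive the block.

from datalad import cfg

key = 'datalad.create.run-after'
try:
    # register a procedure to be run after every `datalad create`
    cfg.add(key, 'add_readme filename=after1.txt', where='global')
    # force a reload so the freshly written ~/.gitconfig is picked up
    cfg.reload(force=True)
    assert key in cfg
finally:
    # drop the entry again so the global config is left as it was found
    cfg.unset(key, where='global')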
Example #2
def _wrap_with_store_insteadof(*args, **kwargs):
    # Inner wrapper of a test decorator: `func` (the wrapped test) and
    # `dl_cfg` (DataLad's global ConfigManager) come from the enclosing scope.
    host = args[0]
    base_path = args[1]
    try:
        dl_cfg.set('url.ria+{prot}://{host}{path}.insteadOf'
                   ''.format(prot='ssh' if host else 'file',
                             host=host if host else '',
                             path=base_path),
                   'ria+ssh://test-store:', where='global', reload=True)
        return func(*args, **kwargs)
    finally:
        dl_cfg.unset('url.ria+{prot}://{host}{path}.insteadOf'
                     ''.format(prot='ssh' if host else 'file',
                               host=host if host else '',
                               path=base_path),
                     where='global', reload=True)
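This excerpt shows only the inner wrapper; the enclosing decorator that supplies `func` and `dl_cfg` is not part of it. A sketch of how such a decorator could be completed is given below; the outer name `with_store_insteadof` is an assumption for illustration, not necessarily the project's actual code.

import functools

from datalad import cfg as dl_cfg


def with_store_insteadof(func):
    # hypothetical outer decorator around the wrapper shown above
    @functools.wraps(func)
    def _wrap_with_store_insteadof(*args, **kwargs):
        host, base_path = args[0], args[1]
        key = 'url.ria+{prot}://{host}{path}.insteadOf'.format(
            prot='ssh' if host else 'file',
            host=host if host else '',
            path=base_path)
        try:
            # point the "test-store" label at the actual store location
            dl_cfg.set(key, 'ria+ssh://test-store:', where='global', reload=True)
            return func(*args, **kwargs)
        finally:
            # always remove the global mapping again, even on test failure
            dl_cfg.unset(key, where='global', reload=True)
    return _wrap_with_store_insteadof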
Example #3
def _test_create_store(host, ds_path, base_path, clone_path):

    # TODO: This is an issue. We are writing to ~/.gitconfig here. Override
    #       doesn't work, since RIARemote itself (actually git-annex!) doesn't
    #       have access to it, so initremote will still fail.
    #       => at least move cfg.set/unset into a decorator, so it doesn't
    #       remain when a test is failing.
    # TODO this should be wrapped in a decorator that performs the set/unset
    # in a try-finally configuration
    cfg.set('url.ria+{prot}://{host}{path}.insteadOf'
            ''.format(prot='ssh' if host else 'file',
                      host=host if host else '',
                      path=base_path),
            'ria+ssh://test-store:',
            where='global')
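    # With this insteadOf mapping in place, any URL that begins with
    # 'ria+ssh://test-store:' is rewritten to the actual store location,
    # so the rest of the test can address the store via that fixed label.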

    ds = Dataset(ds_path).create(force=True)
    subds = ds.create('sub', force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # don't specify special remote. By default should be git-remote + "-ria"
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore")
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # remotes exist, but only in super
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'}, {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'here'}, {s['name'] for s in sub_siblings})

    # TODO: post-update hook was enabled

    # implicit test of success by ria-installing from store:
    ds.publish(to="datastore", transfer_data='all')
    with chpwd(clone_path):
        if host:
            # note, we are not using the "test-store"-label here
            clone('ria+ssh://{}{}#{}'.format(host, base_path, ds.id),
                  path='test_install')
        else:
            # TODO: Whenever ria+file supports special remote config (label),
            # change here:
            clone('ria+file://{}#{}'.format(base_path, ds.id),
                  path='test_install')
        installed_ds = Dataset(op.join(clone_path, 'test_install'))
        assert installed_ds.is_installed()
        assert_repo_status(installed_ds.repo)
        eq_(installed_ds.id, ds.id)
        assert_in(op.join('ds', 'file1.txt'),
                  installed_ds.repo.get_annexed_files())
        assert_result_count(installed_ds.get(op.join('ds', 'file1.txt')),
                            1,
                            status='ok',
                            action='get',
                            path=op.join(installed_ds.path, 'ds', 'file1.txt'))

    # now, again but recursive.
    res = ds.create_sibling_ria("ria+ssh://test-store:",
                                "datastore",
                                recursive=True,
                                existing='replace')
    eq_(len(res), 2)
    assert_result_count(res, 2, status='ok', action="create-sibling-ria")

    # remotes now exist in super and sub
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'}, {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'},
        {s['name']
         for s in sub_siblings})

    cfg.unset('url.ria+{prot}://{host}{path}.insteadOf'
              ''.format(prot='ssh' if host else 'file',
                        host=host if host else '',
                        path=base_path),
              where='global',
              reload=True)
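The TODO at the top of this test (perform the global cfg.set/unset in a try/finally so a failing test does not leave the mapping behind) could also be handled with a small context manager. A sketch, assuming the same ConfigManager API used above; the helper name `store_insteadof` is made up here for illustration.

from contextlib import contextmanager


@contextmanager
def store_insteadof(cfg, host, base_path):
    # hypothetical helper wrapping the cfg.set/cfg.unset pair used above
    key = 'url.ria+{prot}://{host}{path}.insteadOf'.format(
        prot='ssh' if host else 'file',
        host=host if host else '',
        path=base_path)
    cfg.set(key, 'ria+ssh://test-store:', where='global', reload=True)
    try:
        yield
    finally:
        # remove the mapping even if the test body raised
        cfg.unset(key, where='global', reload=True)

The body of `_test_create_store` could then run inside `with store_insteadof(cfg, host, base_path):` and the explicit cleanup call at the end would no longer be needed.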
Example #4
    def __call__(path=None, is_pipeline=False, is_template=False,
                 recursive=False, chdir=None):  # dry_run=False,
        dry_run = False

        from datalad_crawler.pipeline import (
            load_pipeline_from_config, load_pipeline_from_module,
            get_repo_pipeline_config_path, get_repo_pipeline_script_path
        )
        from datalad_crawler.pipeline import run_pipeline
        from datalad.utils import chpwd  # import late so we could mock during tests

        with chpwd(chdir):

            assert not (is_pipeline and is_template), "it is either a pipeline or a template name, can't be both"
            if is_template:
                # generate a config and overload path with its filename
                path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                                commit=True)

            # TODO: centralize via _params_ handling
            if dry_run:
                dryrun_optlabel = 'datalad.crawl.dryrun'
                if dryrun_optlabel in cfg:
                    cfg.unset(dryrun_optlabel, where='local', reload=False)
                cfg.add(dryrun_optlabel, "True", where='local')

            if path is None:

                # get config from the current repository/dataset
                if is_pipeline:
                    raise ValueError("You must specify the file if --pipeline")

                # Let's see if there is a config or pipeline in this repo
                path = get_repo_pipeline_config_path()
                if not path or not exists(path):
                    # Check if there may be the pipeline provided
                    path = get_repo_pipeline_script_path()
                    if path and exists(path):
                        is_pipeline = True

            stats = ActivityStats()

            if not path:
                raise RuntimeError("Cannot locate crawler config or pipeline file")

            if is_pipeline:
                lgr.info("Loading pipeline definition from %s" % path)
                pipeline = load_pipeline_from_module(path)
            else:
                lgr.info("Loading pipeline specification from %s" % path)
                pipeline = load_pipeline_from_config(path)

            lgr.info("Running pipeline %s" % str(pipeline))
            # TODO: capture the state of all branches so in case of crash
            # we could gracefully reset back
            try:
                output = run_pipeline(pipeline, stats=stats)
            except Exception as exc:
                # TODO: config.crawl.failure = full-reset | last-good-master
                # probably ask via ui which action should be performed unless
                # explicitly specified
                raise
            stats.datasets_crawled += 1

            # TODO:  Move gc/clean over here!

            stats_total = stats.get_total()

            if recursive:
                # get all subdatasets, and crawl them too!
                ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
                import os
                from datalad.distribution.dataset import Dataset
                from datalad.api import crawl
                from datalad.utils import swallow_logs
                from datalad.dochelpers import exc_str
                # Note: we could collect all datasets to be crawled here or pass recursive=True
                # into the subdatasets' crawl.  We will collect all of them here so we might later
                # also introduce automatic commits when super-dataset got successfully updated
                subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive, result_xfm='relpaths')

                lgr.info("Crawling %d subdatasets", len(subdatasets))
                output = [output]
                # TODO: parallelize
                # TODO: assumes that all sub-datasets are 'crawllable', and if not
                # just adds them to crawl_failed count.  But may be we should make it more
                # explicit, that some sub-datasets might not need to be crawled, so they get
                # skipped explicitly?
                for ds_ in subdatasets:
                    ds_logfile = utils.get_logfilename(ds_, 'crawl')
                    try:
                        # TODO: might be cool to be able to report a 'heart beat' from the swallow into pbar or smth
                        with swallow_logs(file_=ds_logfile) as cml:
                            output_, stats_ = crawl(chdir=ds_)
                            stats_total += stats_
                            output.append(output_)
                        lgr.info("Crawled %s: %s (log: %s)", ds_, stats_.as_str(mode='line'), ds_logfile)
                    except Exception as exc:
                        stats_total.datasets_crawl_failed += 1
                        stats_total.datasets_crawled += 1
                        output += [None]
                        lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                    ds_, ds_logfile, exc_str(exc))  # , cml.out)

            lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

            return output, stats_total
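This `__call__` backs the `crawl` command of the datalad_crawler extension (the recursive branch calls it back through `datalad.api.crawl`) and returns an `(output, stats)` pair. A minimal invocation sketch through the Python API follows, assuming the current dataset already has a crawler configuration committed (cf. the `path is None` branch above).

from datalad.api import crawl

# crawl the current dataset and all of its subdatasets; `stats` is the
# accumulated ActivityStats object returned above as `stats_total`
output, stats = crawl(recursive=True)
print(stats.as_str(mode='line'))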