Example #1
def test_crawl_s3(path):
    annex = _annex(path)
    # For now a very simple pipeline which doesn't care about files being removed,
    # so we just get the "most recently existing" view of all of them, without commits
    # for previous versions, but with annex processing them and thus doing all the
    # necessary house-keeping
    pipeline = [
        [
            crawl_s3('datalad-test0-versioned', strategy='naive', repo=annex.repo),
            annex
        ],
        annex.finalize()
    ]

    with externals_use_cassette('test_crawl_s3-pipeline1'):
        out = run_pipeline(pipeline)
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats = out[0]['datalad_stats'].get_total()
    eq_(set(total_stats.versions), {target_version})  # we have a bunch of them since we are not uniq'ing them and they are all the same
    total_stats.versions = []
    eq_(total_stats, ActivityStats(files=14, overwritten=5, downloaded=14, urls=14, add_annex=14, downloaded_size=112))

    # if we rerun -- nothing new should have been done, i.e. the result is the same,
    # and ATM we can reuse the same cassette
    with externals_use_cassette('test_crawl_s3-pipeline1'):
        out = run_pipeline(pipeline)
    eq_(out, [{'datalad_stats': ActivityStats(skipped=17)}])
    eq_(out[0]['datalad_stats'].get_total(), ActivityStats(skipped=17))
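Note: Examples #1, #2, #6, and #7 rely on a module-level helper _annex(path) and a target_version constant that are not shown in these excerpts. Below is a minimal, hypothetical sketch of what the helper could look like, using only the Annexificator constructor already shown in Example #13; the real helper may configure the crawler differently:

def _annex(path):
    # hypothetical stand-in for the test module's helper: an Annexificator
    # operating on a freshly created annex repository at `path`
    return Annexificator(path=path, create=True)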
Example #2
def test_crawl_s3_file_to_directory(path):
    annex = _annex(path)

    # with auto_finalize (default), Annexificator will finalize whenever it runs into a conflict
    pipeline = [
        crawl_s3('datalad-test1-dirs-versioned', repo=annex.repo, recursive=True),
    #    annex
        switch('datalad_action',
               {
                   'commit': annex.finalize(tag=True),
                   'remove': annex.remove,
                   'annex':  annex,
               })
    ]
    with externals_use_cassette('test_crawl_s3_file_to_directory-pipeline1'):
        with swallow_logs() as cml:
            out = run_pipeline(pipeline)
    assert(annex.repo.dirty)
    list(annex.finalize()(out[0]))
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats_all = total_stats = out[0]['datalad_stats'].get_total()
    eq_(total_stats,
        # Deletions come as 'files' as well atm
        ActivityStats(files=3, downloaded=3, overwritten=2, urls=3, add_annex=3, downloaded_size=12, versions=['0.0.20160303']))
Example #3
def test_crawl_api_chdir(run_pipeline_, load_pipeline_from_config_, chpwd_):
    output, stats = crawl('some_path_not_checked', chdir='somedir')
    assert_equal(
        stats,
        ActivityStats(datasets_crawled=1))  # nothing was done but we got it
    assert_equal(output, None)

    chpwd_.assert_called_with('somedir')
    load_pipeline_from_config_.assert_called_with('some_path_not_checked')
    run_pipeline_.assert_called_with(['pipeline'],
                                     stats=ActivityStats(datasets_crawled=1))
Example #4
def test_add_archive_content_tar(repo_path):
    mode = 'full'
    special_remotes = [DATALAD_SPECIAL_REMOTE, ARCHIVES_SPECIAL_REMOTE]
    annex = Annexificator(path=repo_path,
                          allow_dirty=True,
                          mode=mode,
                          special_remotes=special_remotes,
                          largefiles="exclude=*.txt and exclude=SOMEOTHER")
    output_add = list(annex({'filename': '1.tar'}))  # adding it to annex
    assert_equal(output_add, [{'filename': '1.tar'}])

    if external_versions['cmd:annex'] >= '6.20170208':
        # should have fixed remotes
        from datalad.consts import DATALAD_SPECIAL_REMOTES_UUIDS
        for remote in special_remotes:
            eq_(annex.repo.get_description(uuid=DATALAD_SPECIAL_REMOTES_UUIDS[remote]),
                '[%s]' % remote)

    #stats = ActivityStats()
    #output_add[0]['datalad_stats'] = ActivityStats()
    output_addarchive = list(
        annex.add_archive_content(
            existing='archive-suffix',
            delete=True,
            strip_leading_dirs=True,)(output_add[0]))
    assert_equal(output_addarchive,
                 [{'datalad_stats': ActivityStats(add_annex=1, add_git=1, files=3, renamed=2),
                   'filename': '1.tar'}])
    assert_true(annex.repo.dirty)
    annex.repo.commit("added")
    ok_file_under_git(annex.repo.path, 'file.txt', annexed=False)
    ok_file_under_git(annex.repo.path, '1.dat', annexed=True)
    assert_false(lexists(opj(repo_path, '1.tar')))
    assert_false(annex.repo.dirty)
Example #5
def test_crawl_api_recursive(get_subdatasets_, run_pipeline_,
                             load_pipeline_from_config_,
                             get_repo_pipeline_script_path_, get_logfilename_,
                             chpwd_, tdir):
    pwd = getpwd()
    with chpwd(tdir):
        output, stats = crawl(recursive=True)
    assert_equal(pwd, getpwd())
    if external_versions['mock'] < '1.0.1':
        raise SkipTest(
            "needs a more recent mock which throws exceptions in side_effects")
    assert_equal(output,
                 [[]] * 4 + [None])  # for now output is just a list of outputs
    assert_equal(
        stats, ActivityStats(
            datasets_crawled=5,
            datasets_crawl_failed=1))  # nothing was done but we got it crawled
    chpwd_.assert_has_calls([
        call(None),
        call('path1'),
        call('path1/path1_1'),
        call('path2'),
    ],
                            any_order=True)
    assert_equal(
        list(find_files('.*', tdir, exclude_vcs=False)),
        [_path_(tdir, 'some.log')])  # no files were generated besides the log
Example #6
def test_crawl_s3_commit_versions(path):
    annex = _annex(path)

    # Fancier setup so we could do any of desired actions within a single sweep
    pipeline = [
        crawl_s3('datalad-test0-versioned', strategy='commit-versions', repo=annex.repo),
        switch('datalad_action',
               {
                   'commit': annex.finalize(tag=True),
                   'remove': annex.remove,
                   'annex':  annex,
               })
    ]

    with externals_use_cassette('test_crawl_s3-pipeline1'):
        with swallow_logs(new_level=logging.WARN) as cml:
            out = run_pipeline(pipeline)
            assert_in("There is already a tag %s" % target_version, cml.out)
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats = out[0]['datalad_stats'].get_total()

    eq_(set(total_stats.versions), {target_version})  # we have a bunch of them since we are not uniq'ing them and they are all the same
    # override for easier checking
    total_stats.versions = []
    eq_(total_stats,
        # Deletions come as 'files' as well atm
        ActivityStats(files=17, overwritten=3, downloaded=14, urls=14, add_annex=14, removed=3, downloaded_size=112))
    tags = annex.repo.get_tags(output='name')
    assert_in(target_version, tags)
    # and we actually got 7 more commits
    for t in range(1, 8):
        assert_in(target_version + "+%d" % t, tags)

    # if we rerun -- nothing new should have been done, i.e. the result is the same,
    # and ATM we can reuse the same cassette
    with externals_use_cassette('test_crawl_s3-pipeline1'):
        with swallow_logs() as cml:
            out = run_pipeline(pipeline)
            assert_not_in("There is already a tag %s" % target_version, cml.out)
    eq_(out, [{'datalad_stats': ActivityStats(skipped=17)}])
    eq_(out[0]['datalad_stats'].get_total(), ActivityStats(skipped=17))  # Really nothing was done
Example #7
def test_crawl_s3_commit_versions_one_at_a_time(path):
    annex = _annex(path)

    # Fancier setup so we could do any of desired actions within a single sweep
    pipeline = [
        crawl_s3('datalad-test0-versioned', strategy='commit-versions', repo=annex.repo, ncommits=1),
        switch('datalad_action',
               {
                   'commit': annex.finalize(tag=True),
                   'remove': annex.remove,
                   'annex':  annex,
               })
    ]

    with externals_use_cassette('test_crawl_s3-pipeline1'):
        with swallow_logs(new_level=logging.WARN) as cml:
            out = run_pipeline(pipeline)
            assert_not_in("There is already a tag %s" % target_version, cml.out)
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats_all = total_stats = out[0]['datalad_stats'].get_total()
    eq_(total_stats,
        # Deletions come as 'files' as well atm
        ActivityStats(files=3, downloaded=3, urls=3, add_annex=3, downloaded_size=24, versions=[target_version]))

    # and there should be 7 more, every time changing the total stats
    for t in range(1, 8):
        with externals_use_cassette('test_crawl_s3-pipeline1'):
            with swallow_logs(new_level=logging.WARN) as cml:
                out = run_pipeline(pipeline)
                assert_in("There is already a tag %s" % target_version, cml.out)
        total_stats_ = out[0]['datalad_stats'].get_total()
        assert_not_equal(total_stats, total_stats_)
        total_stats = total_stats_
        total_stats_all += total_stats

    # with the total stats at the end being the same as if done all at once
    total_stats_all.versions = []
    eq_(total_stats_all,
        # Deletions come as 'files' as well atm
        ActivityStats(files=17, skipped=72, overwritten=3, downloaded=14, urls=14, add_annex=14, removed=3, downloaded_size=112))
Example #8
def test_pipeline_dropped_stats():
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def n2(data):  # doesn't care to maintain previous stats
        yield {'out': 1}

    pipeline_output = run_pipeline([{'output': 'outputs'}, n1, n2])
    eq_(pipeline_output, [{
        'datalad_stats': ActivityStats(add_git=1),
        'out': 1
    }])
Example #9
def test_pipeline_updated_stats():
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def n2(data):  # doesn't care to maintain previous stats
        data = data.copy()
        data['datalad_stats'] = ActivityStats(files=2)
        data['out'] = 1
        yield data

    pipeline_output = run_pipeline([{'output': 'outputs'}, n1, n2])
    eq_(pipeline_output, [{
        'datalad_stats': ActivityStats(files=2, add_git=1),
        'out': 1
    }])
Example #10
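# Fragment of test_remove_other_versions (see Example #13 below), where
# annex, version_db, and the Unlinker mock used here are defined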
def _check(version, remaining=None, unlinked=None, **kwargs):
    """Helper -- for a given version and kwargs for remove_other_versions -- what to expect"""
    version_db.version = version
    data = {'datalad_stats': ActivityStats()}
    rov = annex.remove_other_versions(db=version_db, **kwargs)  # generator
    with patch('os.unlink', new_callable=Unlinker) as cmunlink, \
            patch('os.path.lexists', return_value=True), \
            patch('datalad_crawler.nodes.annex.find_files', new_callable=cmunlink.Find_files):
        out = list(rov(data))
    assert_equal(len(out), 1)
    eq_(out[0]['datalad_stats'].versions, [version])
    if remaining is not None:
        eq_(cmunlink.remaining, remaining)
    if unlinked is not None:
        eq_(cmunlink.unlinked, unlinked)
    eq_(cmunlink.remaining.union(cmunlink.unlinked), cmunlink.allfiles)
Example #11
def test_pipeline_stats_persist():
    # to test that we would get proper stats returned in various pipeline layouts
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def p(data):
        yield data

    def n2(data):  # doesn't care to maintain previous stats
        data['datalad_stats'].increment('add_annex')
        yield data

    target_stats = ActivityStats(add_git=1, add_annex=1)

    def assert_pipeline(pipeline):
        eq_(run_pipeline(pipeline), [{'datalad_stats': target_stats}])

    assert_pipeline([n1, n2])
    assert_pipeline([n1, [n2]])
    assert_pipeline([[n1], [n2]])
    assert_pipeline([n1, [n2, p]])
    assert_pipeline([[n1], n2])
    assert_pipeline([[n1, p], n2])
Example #12
def xrun_pipeline(pipeline, data=None, stats=None, reset=True):
    """Yield results from the pipeline.

    """
    id_pipeline = "Pipe #%s" % id(pipeline)

    def _log(msg, *args):
        """Helper for uniform debug messages"""
        lgr.log(5, "%s: " + msg, id_pipeline, *args)

    _log("%s", pipeline)

    if reset:
        _log("Resetting pipeline")
        reset_pipeline(pipeline)

    # just for the paranoid and the PEP8-disturbed: theoretically no node
    # should change the data, so having a default of {} should be sufficient
    data = data or {}

    if 'datalad_stats' in data:
        if stats is not None:
            raise ValueError(
                "We were provided stats to use, but data has already datalad_stats"
            )
    else:
        data = updated(data, {'datalad_stats': stats or ActivityStats()})

    if not len(pipeline):
        return

    # options for this pipeline
    opts, pipeline = _get_pipeline_opts(pipeline)

    # verify that we know about all specified options
    unknown_opts = set(opts).difference(set(PIPELINE_OPTS))
    if unknown_opts:
        raise ValueError("Unknown pipeline options %s" % str(unknown_opts))

    data_to_process = [data]
    output = opts['output']
    if output not in ('input', 'last-output', 'outputs', 'input+outputs'):
        raise ValueError("Unknown output=%r" % output)

    if opts['loop'] and output == 'input':
        lgr.debug(
            "Assigning output='last-output' for sub-pipeline since we want "
            "to loop until pipeline returns anything")
        output_sub = 'last-output'
    else:
        output_sub = output

    log_level = lgr.getEffectiveLevel()
    data_out = None
    while data_to_process:
        _log("processing data. %d left to go", len(data_to_process))
        data_in = data_to_process.pop(0)
        try:
            for idata_out, data_out in enumerate(
                    xrun_pipeline_steps(pipeline, data_in, output=output_sub)):
                if log_level <= 3:
                    # provide details of what keys got changed
                    # TODO: unify with 2nd place where it was invoked
                    lgr.log(3, "O3: +%s, -%s, ch%s, ch?%s",
                            *_compare_dicts(data_in, data_out))

                _log("got new %dth output", idata_out)
                if opts['loop']:
                    _log(
                        "extending list of data to process due to loop option")
                    data_to_process.append(data_out)
                if 'outputs' in output:
                    _log("yielding output")
                    yield data_out
        except FinishPipeline as e:
            # TODO: decide what we would like to do -- skip that particular pipeline run
            # or all subsequent ones, or maybe go back and only skip that generated result
            _log("got a signal that pipeline is 'finished'")

    # TODO: this implementation is somewhat bad since all the output logic is
    # duplicated within xrun_pipeline_steps, but it is probably unavoidable because of
    # loop option
    if output == 'last-output':
        if data_out:
            _log("yielding last-output")
            yield data_out

    # Input should be yielded last since otherwise it might ruin the flow for typical
    # pipelines which do not expect anything beyond going step by step
    # We should yield input data even if it was empty
    if 'input' in output:
        _log("finally yielding input data as instructed")
        yield data
Example #13
def test_remove_other_versions(repo_path):
    annex = Annexificator(path=repo_path, create=True)

    class version_db:
        version = '1.0.0'
        versions = OrderedDict([
            ('10.0.0', {}),  # wrong order -- will fail and we will pop it
            ('1.0.0', {
                'a': 'a_1.0.0'
            }),
            ('2.0.0', {
                'b': 'b_2.0.0',
                'c': 'c_2.0.0'
            }),
            ('2.0.1', {
                'c': 'c_2.0.1'
            }),  # if no overlay, b is gone, overlay=2 should keep b
            ('2.1.1', {
                'c': 'c_2.1.1'
            }),  # b should disappear if overlay=2, but stay if overlay=1
            ('3', {
                'd': 'd_3'
            }),
        ])

    assert_raises(ValueError,
                  annex.remove_other_versions,
                  'name',
                  db=version_db)

    data = {'datalad_stats': ActivityStats()}
    rov = annex.remove_other_versions(db=version_db)  # generator

    # we have that incorrectly ordered version inside
    assert_raises(AssertionError, next, rov(data))
    version_db.versions.pop('10.0.0')  # remove the abuser

    class Unlinker(object):
        def __init__(self):
            self.unlinked = []
            self.allfiles = set()
            # Let's also record all present files
            for vfs in version_db.versions.values():
                self.allfiles.update(vfs.values())
            self._remaining = self.allfiles.copy()

        def Find_files(self):
            """To provide another mock callable"""
            def find_files(*args, **kwargs):
                assert_equal(kwargs.get('topdir'), repo_path)
                # return full path
                return [opj(repo_path, x) for x in self._remaining]

            return find_files

        def __call__(self, s):
            bs = basename(s)
            assert (bs in self._remaining)
            self.unlinked.append(bs)
            self._remaining.remove(bs)

        @property
        def remaining(self):
            # strip the repopath
            return self._remaining

    def _check(version, remaining=None, unlinked=None, **kwargs):
        """Helper - for a given version and kwargs for remove_ - what to expect"""
        version_db.version = version
        data = {'datalad_stats': ActivityStats()}
        rov = annex.remove_other_versions(db=version_db, **kwargs)  # generator
        with patch('os.unlink', new_callable=Unlinker) as cmunlink, \
                patch('os.path.lexists', return_value=True),\
                patch('datalad_crawler.nodes.annex.find_files', new_callable=cmunlink.Find_files):
            out = list(rov(data))
        assert_equal(len(out), 1)
        eq_(out[0]['datalad_stats'].versions, [version])
        if remaining is not None:
            eq_(cmunlink.remaining, remaining)
        if unlinked is not None:
            eq_(cmunlink.unlinked, unlinked)
        eq_(cmunlink.remaining.union(cmunlink.unlinked), cmunlink.allfiles)

    def check(*args, **kwargs):
        _check(*args, **kwargs)  # first without remove_unversioned
        # in our test the results should be the same with remove_unversioned=True
        kwargs = kwargs.copy()
        kwargs['remove_unversioned'] = True
        _check(*args, **kwargs)

    check('1.0.0', remaining={'a_1.0.0'})
    # even though due to overlay=0 all versions are identical, there was no
    # version before 1.0.0, and all later ones are removed regardless of overlay
    check('1.0.0', remaining={'a_1.0.0'}, overlay=0)

    check('2.0.0', remaining={'b_2.0.0', 'c_2.0.0'})
    check('2.0.0', remaining={'b_2.0.0', 'c_2.0.0'}, overlay=1)
    check('2.0.0', remaining={'b_2.0.0', 'c_2.0.0'}, overlay=lambda x: x[:1])
    # but with overlay=0 we would also get files from 1
    check('2.0.0', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.0.0'}, overlay=0)

    check('2.0.1', remaining={'c_2.0.1'})
    check('2.0.1', remaining={'b_2.0.0', 'c_2.0.1'}, overlay=1)
    check('2.0.1', remaining={'b_2.0.0', 'c_2.0.1'}, overlay=2)
    check('2.0.1', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.0.1'}, overlay=0)

    check('2.1.1', remaining={'c_2.1.1'})
    check('2.1.1', remaining={'c_2.1.1'}, overlay=2)
    check('2.1.1', remaining={'b_2.0.0', 'c_2.1.1'}, overlay=1)
    check('2.1.1', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.1.1'}, overlay=0)

    check('3', remaining={'d_3'})
    check('3', remaining={'d_3'}, overlay=1)
    check('3', remaining={'d_3'}, overlay=2)
    check('3', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.1.1', 'd_3'}, overlay=0)

    # if name changes
    version_db.versions['2.1.1'] = {'c00.dat': 'c00_2.1.1'}
    # if we don't do anything special, it would have its own life:
    check('2.1.1', remaining={'c00_2.1.1'})
    check('2.1.1', remaining={'c00_2.1.1'}, overlay=2)
    check('2.1.1', remaining={'b_2.0.0', 'c_2.0.1', 'c00_2.1.1'}, overlay=1)
    check('2.1.1',
          remaining={'a_1.0.0', 'b_2.0.0', 'c_2.0.1', 'c00_2.1.1'},
          overlay=0)
    # but we should be able to "unify" the unversioned names by providing
    # replacement patterns
    kw = dict(fpath_subs=[('c00', 'c'), (r'\.dat', '')])
    check('2.1.1', remaining={'b_2.0.0', 'c00_2.1.1'}, overlay=1, **kw)
    check('2.1.1',
          remaining={'a_1.0.0', 'b_2.0.0', 'c00_2.1.1'},
          overlay=0,
          **kw)
Example #14
def _test_annex_file(mode, topdir, topurl, outdir):
    annex = Annexificator(path=outdir,
                          mode=mode,
                          statusdb='fileattr',
                          largefiles="exclude=*.txt")

    input = {'url': "%sd1/1.dat" % topurl, 'filename': '1-copy.dat'}
    tfile = opj(outdir, '1-copy.dat')
    # we add full filepath now
    expected_output = [dict(filepath=opj(outdir, input['filename']), **input)]
    output = list(annex(input))
    assert_equal(expected_output, output)

    # addurl is batched, and we haven't forced annex flushing so there should
    # be a batched process
    if not annex.repo.fake_dates_enabled:
        assert_equal(len(annex.repo._batched), 1)
    # if we finalize, it should flush batched annexes and commit
    list(annex.finalize()({}))
    assert (lexists(tfile))

    ok_file_under_git(tfile, annexed=True)
    if mode == 'full':
        ok_file_has_content(tfile, '1.dat load')
    else:
        # in fast or relaxed mode there must not be any content
        assert_raises(AssertionError, ok_file_has_content, tfile, '1.dat load')

    whereis = annex.repo.whereis(tfile)
    assert_in(annex.repo.WEB_UUID, whereis)  # url must have been added
    assert_equal(len(whereis), 1 + int(mode == 'full'))
    # TODO: check the url
    # The file should not be downloaded again, since nothing changed
    # and by default we do use the files db
    output = list(annex(input))
    assert_equal(output, [])  # nothing was done, so annex didn't yield data
    annex.yield_non_updated = True

    input_with_stats = input.copy()
    input_with_stats['datalad_stats'] = ActivityStats()
    output = list(annex(input_with_stats))
    assert_equal(output[0]['datalad_stats'],
                 ActivityStats(files=1, urls=1, skipped=1))

    # but if we change that file, it should re-download it now
    with open(opj(topdir, 'd1', '1.dat'), 'a') as f:
        f.write("+")
    output = list(annex(input_with_stats))
    stats = output[0]['datalad_stats']
    stats.downloaded_time = 0
    # 2 since we are reusing the same stats
    download_stats = dict(downloaded=1,
                          downloaded_size=11) if mode == 'full' else {}
    addskip_stats = dict(add_annex=0, skipped=2,
                         overwritten=0) if mode == 'relaxed' else dict(
                             add_annex=1, skipped=1, overwritten=1)
    kwargs = download_stats.copy()
    kwargs.update(addskip_stats)
    assert_equal(stats, ActivityStats(files=2, urls=2, **kwargs))

    # Download into a file which will be added to git
    # TODO: for now added to git only in full mode. in --fast or --relaxed, still goes to annex
    # http://git-annex.branchable.com/bugs/treatment_of_largefiles_is_not_working_for_addurl_--fast___40__or_--relaxed__41__/
    input = {
        'url': "%sd1/1.dat" % topurl,
        'filename': '1.txt',
        'datalad_stats': ActivityStats()
    }
    tfile = opj(outdir, '1.txt')
    output = list(annex(input))
    annexed = mode not in {'full'}
    list(annex.finalize()({}))
    if not annexed:
        ok_file_has_content(tfile, '1.dat load+')
    else:
        assert_raises(AssertionError, ok_file_has_content, tfile,
                      '1.dat load+')
    ok_file_under_git(tfile, annexed=annexed)
    assert_equal(len(output), 1)
    stats = output[0]['datalad_stats']
    # reset varying metric
    stats.downloaded_time = 0
    assert_equal(
        stats,
        ActivityStats(files=1,
                      urls=1,
                      add_git=1 - int(annexed),
                      add_annex=int(annexed),
                      **download_stats))

    # Let's add a file without specifying URL
    sfilepath = opj(outdir, 'sample.txt')
    with open(sfilepath, 'w') as f:
        f.write("sample")
    ok_file_has_content(sfilepath, "sample")
    output = list(
        annex({
            'filename': 'sample.txt',
            'datalad_stats': ActivityStats()
        }))
    ok_file_under_git(sfilepath, annexed=False)
    assert (output)
    assert_equal(output[0]['datalad_stats'], ActivityStats(files=1, add_git=1))
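A sketch of how such a parametrized helper is typically driven; the generator-style test below and the exact list of modes are assumptions based on the modes referenced inside the helper ('full', 'fast', 'relaxed'), not the original test module:

def test_annex_file(topdir, topurl, outdir):
    # hypothetical driver yielding one check per supported mode
    for mode in ('full', 'fast', 'relaxed'):
        yield _test_annex_file, mode, topdir, topurl, outdir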
Example #15
def n2(data):  # doesn't care to maintain previous stats
    data = data.copy()
    data['datalad_stats'] = ActivityStats(files=2)
    data['out'] = 1
    yield data
Example #16
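# Fragment: the n2 node from test_pipeline_updated_stats (Example #9 above)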
def test_openfmri_pipeline2(ind, topurl, outd):
    # no versioned files -- should still work! ;)

    list(
        initiate_dataset(template="openfmri",
                         dataset_name='dataladtest-ds666',
                         path=outd,
                         data_fields=['dataset'])({
                             'dataset': 'ds666'
                         }))

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # backend set, dataset init, crawler init, incoming (shares with master -1),
    #   (2 or 3 commits, depending on create variant)
    # incoming-processed, merge, aggregate metadata:
    ncommits_master = len(commits_hexsha['master'])
    assert_in(ncommits_master, [5, 6])
    assert_in(len(commits_l['master']), [4, 5])

    eq_(len(commits_hexsha['incoming']), ncommits_master - 2)
    eq_(len(commits_l['incoming']), ncommits_master - 2)
    eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 1)
    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 2)

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    eq_(out[0]['datalad_stats'], ActivityStats(files=2, skipped=2, urls=2))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    os.rename(opj(ind, 'ds666', 'ds666_R2.0.0.tar.gz'),
              opj(ind, 'ds666', 'ds666.tar.gz'))

    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)
    eq_(out[0]['datalad_stats'], ActivityStats())  # was committed
    stats_total = out[0]['datalad_stats'].get_total()
    stats_total.downloaded_size = 0
    eq_(
        stats_total,
        ActivityStats(files=4,
                      overwritten=1,
                      skipped=1,
                      downloaded=1,
                      merges=[['incoming', 'incoming-processed']],
                      versions=['1.0.0'],
                      renamed=1,
                      urls=2,
                      add_annex=2))
    # in reality there is also 1.0.0+1 tag since file changed but no version suffix
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1'])

    check_dropall_get(repo)
Example #17
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(
        initiate_dataset(template="openfmri",
                         dataset_name='dataladtest-ds666',
                         path=outd,
                         data_fields=['dataset'])({
                             'dataset': 'ds666'
                         }))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes([('metadata/objects/**', {
        'annex.largefiles': '(nothing)'
    })], dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes",
                files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(
        not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents

    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    # all commits out there:
    # dataset init, crawler init
    #   (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    #   - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    #   2*remove of obsolete metadata object files,
    #     see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])

    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents), (commits_l['incoming-processed'][1],
    #                                                         commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents), (commits_l['incoming-processed'][3],  # also in master
    #                                                         commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object file names may differ since they contain some checksum-ish component ...
    # TODO: check how those names are constructed and maybe at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content=
        '<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so the beh data would be gone since there is no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')

    # Ben: metadata object file names may differ since they contain some checksum-ish component ...
    # TODO: check how those names are constructed and maybe at least count the number of created object files in addition to that comparison
    eq_(
        set([
            f for f in all_files_updated
            if not f.startswith('./.datalad/metadata/objects/')
        ]), target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {
        b: list(_get_branch_commits(repo, b, limit='left-only'))
        for b in branches
    }

    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'],
        ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(
        total_stats,
        ActivityStats(
            files=8,
            skipped=5,
            downloaded=1,
            renamed=1,
            urls=6,
            add_annex=2,  # add_git=1, # README
            versions=['2.0.0'],
            merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # into incoming-processed and merged into master -- new commits will come.
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository",
                      cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)
    commits_hexsha_removed = {
        b: list(_get_branch_commits(repo, b))
        for b in branches
    }
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming),
        2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(
        set(dincoming.keys()), {
            repo.pathobj / '.datalad/crawl/statuses/incoming.json',
            repo.pathobj / 'ds666_R1.0.0.tar.gz'
        })

    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as well
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
Example #18
    def __call__(path=None, is_pipeline=False, is_template=False,
                 recursive=False, chdir=None):  # dry_run=False,
        dry_run = False

        from datalad.crawler.pipeline import (
            load_pipeline_from_config, load_pipeline_from_module,
            get_repo_pipeline_config_path, get_repo_pipeline_script_path
        )
        from datalad.crawler.pipeline import run_pipeline
        from datalad.utils import chpwd  # import late so we could mock during tests

        with chpwd(chdir):

            assert not (is_pipeline and is_template), "it is either a pipeline or a template name, can't be both"
            if is_template:
                # generate a config and overload path with its filename
                path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                                commit=True)

            # TODO: centralize via _params_ handling
            if dry_run:
                dryrun_optlabel = 'datalad.crawl.dryrun'
                if dryrun_optlabel in cfg:
                    cfg.unset(dryrun_optlabel, where='local', reload=False)
                cfg.add(dryrun_optlabel, "True", where='local')

            if path is None:

                # get config from the current repository/dataset
                if is_pipeline:
                    raise ValueError("You must specify the file if --pipeline")

                # Let's see if there is a config or pipeline in this repo
                path = get_repo_pipeline_config_path()
                if not path or not exists(path):
                    # Check if there may be the pipeline provided
                    path = get_repo_pipeline_script_path()
                    if path and exists(path):
                        is_pipeline = True

            stats = ActivityStats()

            if not path:
                raise RuntimeError("Cannot locate crawler config or pipeline file")

            if is_pipeline:
                lgr.info("Loading pipeline definition from %s" % path)
                pipeline = load_pipeline_from_module(path)
            else:
                lgr.info("Loading pipeline specification from %s" % path)
                pipeline = load_pipeline_from_config(path)

            lgr.info("Running pipeline %s" % str(pipeline))
            # TODO: capture the state of all branches so in case of crash
            # we could gracefully reset back
            try:
                output = run_pipeline(pipeline, stats=stats)
            except Exception as exc:
                # TODO: config.crawl.failure = full-reset | last-good-master
                # probably ask via ui which action should be performed unless
                # explicitly specified
                raise
            stats.datasets_crawled += 1

            # TODO:  Move gc/clean over here!

            stats_total = stats.get_total()

            if recursive:
                # get all subdatasets, and crawl them too!
                ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
                import os
                from ..distribution.dataset import Dataset
                from ..api import crawl
                from ..utils import swallow_logs
                from ..dochelpers import exc_str
                # Note: we could collect all datasets to be crawled here or pass recursive=True
                # into the subdatasets' crawl.  We will collect all of them here so we might later
                # also introduce automatic commits when super-dataset got successfully updated
                subdatasets = Dataset(os.curdir).get_subdatasets(recursive=recursive)

                lgr.info("Crawling %d subdatasets", len(subdatasets))
                output = [output]
                # TODO: parallelize
                # TODO: assumes that all sub-datasets are 'crawlable', and if not
                # just adds them to the crawl_failed count.  But maybe we should make it more
                # explicit, that some sub-datasets might not need to be crawled, so they get
                # skipped explicitly?
                for ds_ in subdatasets:
                    ds_logfile = utils.get_logfilename(ds_, 'crawl')
                    try:
                        # TODO: might be cool to be able to report a 'heart beat' from the swallow into pbar or smth
                        with swallow_logs(file_=ds_logfile) as cml:
                            output_, stats_ = crawl(chdir=ds_)
                            stats_total += stats_
                            output.append(output_)
                        lgr.info("Crawled %s: %s (log: %s)", ds_, stats_.as_str(mode='line'), ds_logfile)
                    except Exception as exc:
                        stats_total.datasets_crawl_failed += 1
                        stats_total.datasets_crawled += 1
                        output += [None]
                        lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                    ds_, ds_logfile, exc_str(exc))  # , cml.out)

            lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

            return output, stats_total
Example #19
    def __call__(
            archive,
            *,
            dataset=None,
            annex=None,
            add_archive_leading_dir=False,
            strip_leading_dirs=False,
            leading_dirs_depth=None,
            leading_dirs_consider=None,
            use_current_dir=False,
            delete=False,
            key=False,
            exclude=None,
            rename=None,
            existing='fail',
            annex_options=None,
            copy=False,
            commit=True,
            allow_dirty=False,
            stats=None,
            drop_after=False,
            delete_after=False):

        if exclude:
            exclude = ensure_tuple_or_list(exclude)
        if rename:
            rename = ensure_tuple_or_list(rename)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add-archive-content')

        # set up common params for result records
        res_kwargs = {
            'action': 'add-archive-content',
            'logger': lgr,
        }

        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message="Can't operate in a pure Git repository",
                **res_kwargs
            )
            return
        if annex:
            warnings.warn(
                "datalad add_archive_content's `annex` parameter is "
                "deprecated and will be removed in a future release. "
                "Use the 'dataset' parameter instead.",
                DeprecationWarning)
        annex = ds.repo
        # get the archive path relative to the ds root
        archive_path = resolve_path(archive, ds=dataset)
        # let Status decide whether we can act on the given file
        for s in ds.status(
                path=archive_path,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                if 'path not underneath the reference dataset %s' in s['message']:
                    yield get_status_dict(
                        ds=ds,
                        status='impossible',
                        message='Can not add archive outside of the dataset',
                        **res_kwargs)
                    return
                # status errored & we haven't anticipated the cause. Bubble up
                yield s
                return
            elif s['state'] == 'untracked':
                # we can't act on an untracked file
                message = (
                    "Can not add an untracked archive. "
                    "Run 'datalad save {}'".format(archive)
                )
                yield get_status_dict(
                           ds=ds,
                           status='impossible',
                           message=message,
                           **res_kwargs)
                return

        if not allow_dirty and annex.dirty:
            # error out here if the dataset contains untracked changes
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required. '
                    'Use `datalad status` to inspect unsaved changes'),
                **res_kwargs
            )
            return

        # ensure the archive exists, status doesn't error on a non-existing file
        if not key and not lexists(archive_path):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'No such file: {}'.format(archive_path),
                ),
                **res_kwargs
            )
            return

        if not key:
            check_path = archive_path.relative_to(ds.pathobj)
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            # can become get_file_annexinfo once #6104 is merged
            key = annex.get_file_annexinfo(check_path)['key']
            if not key:
                raise RuntimeError(
                    f"Archive must be an annexed file in {ds}")
            archive_dir = Path(archive_path).parent
        else:
            origin = 'key'
            key = archive
            # We must not have anything to do with the location under .git/annex
            archive_dir = None
            # instead, we will go from the current directory
            use_current_dir = True

        archive_basename = file_basename(archive)

        if not key:
            # if we didn't manage to get a key, the file must be in Git
            raise NotImplementedError(
                "Provided file %s does not seem to be under annex control. "
                "We don't support adding everything straight to Git" % archive
            )

        # figure out our location
        pwd = getpwd()
        # are we in a subdirectory of the repository?
        pwd_in_root = annex.path == archive_dir
        # then we should add content under that subdirectory,
        # get the path relative to the repo top
        if use_current_dir:
            # extract the archive under the current directory, not the directory
            # where the archive is located
            extract_rpath = Path(pwd).relative_to(ds.path) \
                if not pwd_in_root \
                else None
        else:
            extract_rpath = archive_dir.relative_to(ds.path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath == curdir:
            extract_rpath = None

        try:
            key_rpath = annex.get_contentlocation(key)
        except:
            # the only probable reason for this to fail is that there is no
            # content present
            raise RuntimeError(
                "Content of %s seems to be N/A.  Fetch it first" % key
            )

        # now we simply need to go through every file in that archive and add it
        lgr.info(
            "Adding content of the archive %s into annex %s", archive, annex
        )

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote

        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        # OK, let's ignore that the following class is actually a special
        # remote implementation, and use it only to work with its cache
        annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                                path=annex.path,
                                                persistent_cache=True)
        # We will move extracted content so it must not exist prior to running
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]
        # make sure there is an enabled datalad-archives special remote
        ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                              autoenable=True)

        precommitted = False
        old_always_commit = annex.always_commit
        # batch mode is disabled when faking dates, we want to always commit
        annex.always_commit = annex.fake_dates_enabled
        if annex_options:
            if isinstance(annex_options, str):
                annex_options = split_cmdline(annex_options)
        delete_after_rpath = None

        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex.path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to the passed-in ones (if any)
        outside_stats = stats
        stats = ActivityStats()

        try:
            # keep track of extracted files for progress bar logging
            file_counter = 0
            # iterate over all files in the archive
            extracted_files = list(earchive.get_extracted_files())
            # start a progress bar for extraction
            pbar_id = f'add-archive-{archive_path}'
            log_progress(
                lgr.info, pbar_id, 'Extracting archive',
                label="Extracting archive",
                unit=' Files',
                total = len(extracted_files),
                noninteractive_level = logging.INFO)
            for extracted_file in extracted_files:
                file_counter += 1
                files_left = len(extracted_files) - file_counter
                log_progress(
                    lgr.info, pbar_id,
                    "Files to extract %i ", files_left,
                    update=1,
                    increment=True,
                    noninteractive_level=logging.DEBUG)
                stats.files += 1
                extracted_path = Path(earchive.path) / Path(extracted_file)

                if extracted_path.is_symlink():
                    link_path = str(extracted_path.resolve())
                    if not exists(link_path):
                        # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning(
                            "Path %s points to non-existing file %s",
                            extracted_path, link_path
                        )
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of archive - warn & skip

                url = annexarchive.get_file_url(
                    archive_key=key,
                    file=extracted_file,
                    size=os.stat(extracted_path).st_size)
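                # this URL points into the archive (by its annex key) and can
                # be resolved later by the datalad-archives special remote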

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = Path(extracted_file)

                # stream archives would not have had the original filename
                # information in them, so would be extracted under a name
                # derived from their annex key.
                # Provide ad-hoc handling for such cases
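                # (e.g. a lone file from 'data.txt.gz' extracted under a
                # key-derived name is renamed back to 'data.txt')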
                if (len(extracted_files) == 1
                        and Path(archive).suffix in ('.xz', '.gz', '.lzma')
                        and Path(key_rpath).name.startswith(
                            Path(extracted_file).name)):
                    # take archive's name without extension for filename & place
                    # where it was originally extracted
                    target_file = \
                        Path(extracted_file).parent / Path(archive).stem

                if strip_leading_dirs:
                    leading_dir = earchive.get_leading_directory(
                        depth=leading_dirs_depth, exclude=exclude,
                        consider=leading_dirs_consider)
                    leading_dir_len = \
                        len(leading_dir) + len(opsep) if leading_dir else 0
                    target_file = str(target_file)[leading_dir_len:]
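                    # e.g. with a single leading directory 'pkg-1.0/',
                    # 'pkg-1.0/src/file.c' becomes 'src/file.c'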

                if add_archive_leading_dir:
                    # place extracted content under a directory corresponding to
                    # the archive name with suffix stripped.
                    target_file = Path(archive_basename) / target_file
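                    # e.g. content of 'data.tar.gz' ends up under 'data/'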

                if rename:
                    target_file = apply_replacement_rules(rename,
                                                          str(target_file))

                # skip this file if it matches any of the exclusion patterns
                if exclude:
                    matched = next((regexp for regexp in exclude
                                    if re.search(regexp, extracted_file)), None)
                    if matched:
                        lgr.debug("Skipping %s since it matches the %s pattern",
                                  extracted_file, matched)
                        stats.skipped += 1
                        continue

                if delete_after:
                    # place target file in a temporary directory
                    target_file = Path(prefix_dir) / Path(target_file)
                    # and do the same for the original-name variant
                    target_file_orig = Path(prefix_dir) / Path(target_file_orig)

                target_file_path_orig = annex.pathobj / target_file_orig

                # If we were invoked in a subdirectory, patch together the
                # correct path
                target_file_path = extract_rpath / target_file \
                    if extract_rpath else target_file
                target_file_path = annex.pathobj / target_file_path

                # when the file already exists...
                if lexists(target_file_path):
                    handle_existing = True
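                    # flipped to False below when the identical content is
                    # already annexed, in which case we only register an
                    # additional URL for it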
                    if md5sum(str(target_file_path)) == \
                            md5sum(str(extracted_path)):
                        if not annex.is_under_annex(str(extracted_path)):
                            # if under annex -- must be having the same content,
                            # we should just add possibly a new extra URL
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        message = \
                            "{} exists, but would be overwritten by new file " \
                            "{}. Consider adjusting --existing".format(
                                target_file_path, extracted_file)
                        yield get_status_dict(
                            ds=ds,
                            status='error',
                            message=message,
                            **res_kwargs)
                        return
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a
                        # tree
                        rmtree(target_file_path)
                    else:
                        # an elaborate dance to piece together a new,
                        # non-conflicting file name
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the
                        # filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # handled by the shared numeric increment below
                        else:
                            # we shouldn't get here, argparse should catch a
                            # non-existing value for --existing right away
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file
                        # doesn't collide
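                        # e.g. 'file.txt' -> 'file.1.txt' -> 'file.2.txt' ...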
                        suf, i = '', 0
                        while True:
                            connector = \
                                ('.' if (fn_ext or ends_with_dot) else '')
                            file = fn_base + suf + connector + fn_ext
                            target_file_path_new = Path(p) / file
                            if not lexists(target_file_path_new):
                                # we found a file name that is not yet taken
                                break
                            lgr.debug("Iteration %i of file name finding. "
                                      "File %s already exists", i,
                                      target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                if copy:
                    raise NotImplementedError(
                        "Not yet copying from 'persistent' cache"
                    )

                lgr.debug("Adding %s to annex pointing to %s and with options "
                          "%r", target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:
                    # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        # drop extracted files after adding to annex
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(
                        target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # we count the removal here, but don't yet perform it
                    # to not interfere with batched processes - any pure Git
                    # action invokes precommit which closes batched processes.
                    stats.removed += 1

                # Done with target_file -- just to have clear end of the loop
                del target_file

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since some times might still be staged and fail
                annex.remove(str(archive_path), force=True)

            lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) \
                    if extract_rpath else prefix_dir
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(str(delete_after_rpath), r=True, force=True)
            if commit:
                archive_rpath = archive_path.relative_to(ds.path)
                commit_stats = outside_stats if outside_stats else stats
                # so batched ones close and files become annex symlinks etc
                annex.precommit()
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath,
                         commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
            else:
                # don't commit upon completion
                pass
        finally:
            # take down the progress bar
            log_progress(
                lgr.info, pbar_id,
                'Finished extraction',
                noninteractive_level=logging.INFO)
            # since we batched addurl, we should close those batched processes
            # if haven't done yet.  explicitly checked to avoid any possible
            # "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex.path, delete_after_rpath)
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)
            # remove tempfile directories (not cleaned up automatically):
            if prefix_dir is not None and lexists(prefix_dir):
                os.rmdir(prefix_dir)
        yield get_status_dict(
            ds=ds,
            status='ok',
            **res_kwargs)
        return annex
Example #20
0

@with_tree(
    tree={
        'pipeline.py': 'pipeline = lambda: [1]',
        'pipeline2.py': 'pipeline = lambda x: [2*x]',
    })
def test_load_pipeline_from_script(d):
    eq_(load_pipeline_from_module(opj(d, 'pipeline.py')), [1])
    eq_(load_pipeline_from_module(opj(d, 'pipeline2.py'), kwargs=dict(x=2)),
        [4])
    assert_raises(RuntimeError, load_pipeline_from_module,
                  opj(d, 'unlikelytobethere.py'))


DEFAULT_OUTPUT = [{'datalad_stats': ActivityStats()}]


def _out(ld):
    """Adjust output entry to include default outputs as well
    """
    outl = []
    for d in ld:
        out = d.copy()
        outl.append(out)
        for k, v in DEFAULT_OUTPUT[0].items():
            if k not in out:
                out[k] = v
    return outl