def test_crawl_s3(path):
    annex = _annex(path)

    # For now a very simple one which doesn't give a damn about files being removed
    # so we just get the "most recent existed" view of all of them without having commits
    # for previous versions but annex processing them thus doing all house-keeping
    # necessary
    pipeline = [
        [
            crawl_s3('datalad-test0-versioned', strategy='naive',
                     repo=annex.repo),
            annex
        ],
        annex.finalize()
    ]

    with externals_use_cassette('test_crawl_s3-pipeline1'):
        out = run_pipeline(pipeline)
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats = out[0]['datalad_stats'].get_total()
    eq_(set(total_stats.versions), {target_version})
    # we have a bunch of them since not uniq'ing them and they are all the same
    total_stats.versions = []
    eq_(total_stats,
        ActivityStats(files=14, overwritten=5, downloaded=14, urls=14,
                      add_annex=14, downloaded_size=112))

    # if we rerun -- nothing new should have been done,
    # and ATM we can reuse the same cassette
    with externals_use_cassette('test_crawl_s3-pipeline1'):
        out = run_pipeline(pipeline)
    eq_(out, [{'datalad_stats': ActivityStats(skipped=17)}])
    eq_(out[0]['datalad_stats'].get_total(), ActivityStats(skipped=17))
def test_crawl_s3_file_to_directory(path):
    annex = _annex(path)
    # with auto_finalize (default), Annexificator will finalize whenever it
    # runs into a conflict
    pipeline = [
        crawl_s3('datalad-test1-dirs-versioned', repo=annex.repo,
                 recursive=True),
        # annex
        switch('datalad_action',
               {
                   'commit': annex.finalize(tag=True),
                   'remove': annex.remove,
                   'annex': annex,
               })
    ]

    with externals_use_cassette('test_crawl_s3_file_to_directory-pipeline1'):
        with swallow_logs() as cml:
            out = run_pipeline(pipeline)
    assert(annex.repo.dirty)
    list(annex.finalize()(out[0]))
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats_all = total_stats = out[0]['datalad_stats'].get_total()
    eq_(total_stats,
        # Deletions come as 'files' as well atm
        ActivityStats(files=3, downloaded=3, overwritten=2, urls=3,
                      add_annex=3, downloaded_size=12,
                      versions=['0.0.20160303']))
def test_crawl_api_chdir(run_pipeline_, load_pipeline_from_config_, chpwd_):
    output, stats = crawl('some_path_not_checked', chdir='somedir')
    assert_equal(stats, ActivityStats(datasets_crawled=1))  # nothing was done but we got it
    assert_equal(output, None)

    chpwd_.assert_called_with('somedir')
    load_pipeline_from_config_.assert_called_with('some_path_not_checked')
    run_pipeline_.assert_called_with(['pipeline'],
                                     stats=ActivityStats(datasets_crawled=1))
def test_add_archive_content_tar(repo_path):
    mode = 'full'
    special_remotes = [DATALAD_SPECIAL_REMOTE, ARCHIVES_SPECIAL_REMOTE]
    annex = Annexificator(path=repo_path,
                          allow_dirty=True,
                          mode=mode,
                          special_remotes=special_remotes,
                          largefiles="exclude=*.txt and exclude=SOMEOTHER")
    output_add = list(annex({'filename': '1.tar'}))  # adding it to annex
    assert_equal(output_add, [{'filename': '1.tar'}])

    if external_versions['cmd:annex'] >= '6.20170208':
        # should have fixed remotes
        from datalad.consts import DATALAD_SPECIAL_REMOTES_UUIDS
        for remote in special_remotes:
            eq_(annex.repo.get_description(
                    uuid=DATALAD_SPECIAL_REMOTES_UUIDS[remote]),
                '[%s]' % remote)

    #stats = ActivityStats()
    #output_add[0]['datalad_stats'] = ActivityStats()
    output_addarchive = list(
        annex.add_archive_content(
            existing='archive-suffix',
            delete=True,
            strip_leading_dirs=True,)(output_add[0]))
    assert_equal(output_addarchive,
                 [{'datalad_stats': ActivityStats(add_annex=1, add_git=1,
                                                  files=3, renamed=2),
                   'filename': '1.tar'}])
    assert_true(annex.repo.dirty)
    annex.repo.commit("added")
    ok_file_under_git(annex.repo.path, 'file.txt', annexed=False)
    ok_file_under_git(annex.repo.path, '1.dat', annexed=True)
    assert_false(lexists(opj(repo_path, '1.tar')))
    assert_false(annex.repo.dirty)
def test_crawl_api_recursive(get_subdatasets_, run_pipeline_,
                             load_pipeline_from_config_,
                             get_repo_pipeline_script_path_,
                             get_lofilename_, chpwd_, tdir):
    pwd = getpwd()
    with chpwd(tdir):
        output, stats = crawl(recursive=True)
    assert_equal(pwd, getpwd())
    if external_versions['mock'] < '1.0.1':
        raise SkipTest(
            "needs a more recent mock which throws exceptions in side_effects")
    assert_equal(output, [[]] * 4 + [None])  # for now output is just a list of outputs
    assert_equal(stats,
                 ActivityStats(datasets_crawled=5,
                               datasets_crawl_failed=1))
    # nothing was done but we got it crawled
    chpwd_.assert_has_calls(
        [
            call(None),
            call('path1'),
            call('path1/path1_1'),
            call('path2'),
        ], any_order=True)
    assert_equal(list(find_files('.*', tdir, exclude_vcs=False)),
                 [_path_(tdir, 'some.log')])  # no files were generated besides the log
def test_crawl_s3_commit_versions(path):
    annex = _annex(path)

    # Fancier setup so we could do any of desired actions within a single sweep
    pipeline = [
        crawl_s3('datalad-test0-versioned', strategy='commit-versions',
                 repo=annex.repo),
        switch('datalad_action',
               {
                   'commit': annex.finalize(tag=True),
                   'remove': annex.remove,
                   'annex': annex,
               })
    ]

    with externals_use_cassette('test_crawl_s3-pipeline1'):
        with swallow_logs(new_level=logging.WARN) as cml:
            out = run_pipeline(pipeline)
        assert_in("There is already a tag %s" % target_version, cml.out)
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats = out[0]['datalad_stats'].get_total()
    eq_(set(total_stats.versions), {target_version})
    # we have a bunch of them since not uniq'ing them and they are all the same
    # override for easier checking
    total_stats.versions = []
    eq_(total_stats,
        # Deletions come as 'files' as well atm
        ActivityStats(files=17, overwritten=3, downloaded=14,
                      urls=14, add_annex=14, removed=3, downloaded_size=112))
    tags = annex.repo.get_tags(output='name')
    assert_in(target_version, tags)
    # and we actually got 7 more commits
    for t in range(1, 8):
        assert_in(target_version + "+%d" % t, tags)

    # if we rerun -- nothing new should have been done,
    # and ATM we can reuse the same cassette
    with externals_use_cassette('test_crawl_s3-pipeline1'):
        with swallow_logs() as cml:
            out = run_pipeline(pipeline)
        assert_not_in("There is already a tag %s" % target_version, cml.out)
    eq_(out, [{'datalad_stats': ActivityStats(skipped=17)}])
    eq_(out[0]['datalad_stats'].get_total(),
        ActivityStats(skipped=17))  # Really nothing was done
def test_crawl_s3_commit_versions_one_at_a_time(path):
    annex = _annex(path)

    # Fancier setup so we could do any of desired actions within a single sweep
    pipeline = [
        crawl_s3('datalad-test0-versioned', strategy='commit-versions',
                 repo=annex.repo, ncommits=1),
        switch('datalad_action',
               {
                   'commit': annex.finalize(tag=True),
                   'remove': annex.remove,
                   'annex': annex,
               })
    ]

    with externals_use_cassette('test_crawl_s3-pipeline1'):
        with swallow_logs(new_level=logging.WARN) as cml:
            out = run_pipeline(pipeline)
        assert_not_in("There is already a tag %s" % target_version, cml.out)
    # things are committed and thus stats are empty
    eq_(out, [{'datalad_stats': ActivityStats()}])
    total_stats_all = total_stats = out[0]['datalad_stats'].get_total()
    eq_(total_stats,
        # Deletions come as 'files' as well atm
        ActivityStats(files=3, downloaded=3, urls=3, add_annex=3,
                      downloaded_size=24, versions=[target_version]))

    # and there should be 7 more, every time changing the total stats
    for t in range(1, 8):
        with externals_use_cassette('test_crawl_s3-pipeline1'):
            with swallow_logs(new_level=logging.WARN) as cml:
                out = run_pipeline(pipeline)
            assert_in("There is already a tag %s" % target_version, cml.out)
        total_stats_ = out[0]['datalad_stats'].get_total()
        assert_not_equal(total_stats, total_stats_)
        total_stats = total_stats_
        total_stats_all += total_stats

    # with total stats at the end to be the same as if all at once
    total_stats_all.versions = []
    eq_(total_stats_all,
        # Deletions come as 'files' as well atm
        ActivityStats(files=17, skipped=72, overwritten=3, downloaded=14,
                      urls=14, add_annex=14, removed=3, downloaded_size=112))
def test_pipeline_dropped_stats():
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def n2(data):
        # doesn't care to maintain previous stats
        yield {'out': 1}

    pipeline_output = run_pipeline([{'output': 'outputs'}, n1, n2])
    eq_(pipeline_output, [{'datalad_stats': ActivityStats(add_git=1),
                           'out': 1}])
def test_pipeline_updated_stats():
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def n2(data):
        # doesn't care to maintain previous stats
        data = data.copy()
        data['datalad_stats'] = ActivityStats(files=2)
        data['out'] = 1
        yield data

    pipeline_output = run_pipeline([{'output': 'outputs'}, n1, n2])
    eq_(pipeline_output, [{'datalad_stats': ActivityStats(files=2, add_git=1),
                           'out': 1}])
def test_pipeline_stats_persist():
    # to test that we would get proper stats returned in various pipeline layouts
    def n1(data):
        data['datalad_stats'].increment('add_git')
        yield data

    def p(data):
        yield data

    def n2(data):
        # doesn't care to maintain previous stats
        data['datalad_stats'].increment('add_annex')
        yield data

    target_stats = ActivityStats(add_git=1, add_annex=1)

    def assert_pipeline(pipeline):
        eq_(run_pipeline(pipeline), [{'datalad_stats': target_stats}])

    assert_pipeline([n1, n2])
    assert_pipeline([n1, [n2]])
    assert_pipeline([[n1], [n2]])
    assert_pipeline([n1, [n2, p]])
    assert_pipeline([[n1], n2])
    assert_pipeline([[n1, p], n2])
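
# A minimal reading aid for the three tests above (assuming only the
# ActivityStats API they already exercise): nodes mutate the shared stats
# instance in place, so the counters survive any pipeline nesting.
stats = ActivityStats()
stats.increment('add_git')
stats.increment('add_annex')
assert stats == ActivityStats(add_git=1, add_annex=1)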
def xrun_pipeline(pipeline, data=None, stats=None, reset=True):
    """Yield results from the pipeline.
    """
    id_pipeline = "Pipe #%s" % id(pipeline)

    def _log(msg, *args):
        """Helper for uniform debug messages"""
        lgr.log(5, "%s: " + msg, id_pipeline, *args)

    _log("%s", pipeline)

    if reset:
        _log("Resetting pipeline")
        reset_pipeline(pipeline)

    # just for paranoids and PEP8-disturbed, since theoretically every node
    # should not change the data, so having default {} should be sufficient
    data = data or {}
    if 'datalad_stats' in data:
        if stats is not None:
            raise ValueError(
                "We were provided stats to use, but data already has datalad_stats"
            )
    else:
        data = updated(data, {'datalad_stats': stats or ActivityStats()})

    if not len(pipeline):
        return

    # options for this pipeline
    opts, pipeline = _get_pipeline_opts(pipeline)
    # verify that we know about all specified options
    unknown_opts = set(opts).difference(set(PIPELINE_OPTS))
    if unknown_opts:
        raise ValueError("Unknown pipeline options %s" % str(unknown_opts))

    data_to_process = [data]
    output = opts['output']
    if output not in ('input', 'last-output', 'outputs', 'input+outputs'):
        raise ValueError("Unknown output=%r" % output)

    if opts['loop'] and output == 'input':
        lgr.debug(
            "Assigning output='last-output' for sub-pipeline since we want "
            "to loop until pipeline returns anything")
        output_sub = 'last-output'
    else:
        output_sub = output

    log_level = lgr.getEffectiveLevel()

    data_out = None
    while data_to_process:
        _log("processing data. %d left to go", len(data_to_process))
        data_in = data_to_process.pop(0)
        try:
            for idata_out, data_out in enumerate(
                    xrun_pipeline_steps(pipeline, data_in, output=output_sub)):
                if log_level <= 3:
                    # provide details of what keys got changed
                    # TODO: unify with 2nd place where it was invoked
                    lgr.log(3, "O3: +%s, -%s, ch%s, ch?%s",
                            *_compare_dicts(data_in, data_out))
                _log("got new %dth output", idata_out)
                if opts['loop']:
                    _log("extending list of data to process due to loop option")
                    data_to_process.append(data_out)
                if 'outputs' in output:
                    _log("yielding output")
                    yield data_out
        except FinishPipeline as e:
            # TODO: decide what we would like to do -- skip that particular pipeline run
            # or all subsequent or may be go back and only skip that generated result
            _log("got a signal that pipeline is 'finished'")

    # TODO: this implementation is somewhat bad since all the output logic is
    # duplicated within xrun_pipeline_steps, but it is probably unavoidable because of
    # loop option
    if output == 'last-output':
        if data_out:
            _log("yielding last-output")
            yield data_out

    # Input should be yielded last since otherwise it might ruin the flow for typical
    # pipelines which do not expect anything beyond going step by step
    # We should yield input data even if it was empty
    if 'input' in output:
        _log("finally yielding input data as instructed")
        yield data
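
# Hedged usage sketch: `emit` is a hypothetical node; everything else is
# defined in this module. xrun_pipeline is a generator, so nothing runs until
# it is consumed, and the leading options dict selects what gets yielded --
# here every output produced by the nodes.
def emit(data):
    yield dict(data, produced=True)

for data_out in xrun_pipeline([{'output': 'outputs'}, emit]):
    print(data_out['produced'], data_out['datalad_stats'])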
def test_remove_other_versions(repo_path):
    annex = Annexificator(path=repo_path, create=True)

    class version_db:
        version = '1.0.0'
        versions = OrderedDict([
            ('10.0.0', {}),  # wrong order -- will fail and we will pop it
            ('1.0.0', {'a': 'a_1.0.0'}),
            ('2.0.0', {'b': 'b_2.0.0', 'c': 'c_2.0.0'}),
            ('2.0.1', {'c': 'c_2.0.1'}),  # if no overlay, b is gone, overlay=2 should keep b
            ('2.1.1', {'c': 'c_2.1.1'}),  # b should disappear if overlay=2, but stay if overlay=1
            ('3', {'d': 'd_3'}),
        ])

    assert_raises(ValueError, annex.remove_other_versions, 'name',
                  db=version_db)

    data = {'datalad_stats': ActivityStats()}
    rov = annex.remove_other_versions(db=version_db)  # generator
    # we have that incorrectly ordered version inside
    assert_raises(AssertionError, next, rov(data))
    version_db.versions.pop('10.0.0')  # remove the abuser

    class Unlinker(object):
        def __init__(self):
            self.unlinked = []
            self.allfiles = set()
            # Let's also record all present files
            for vfs in version_db.versions.values():
                self.allfiles.update(vfs.values())
            self._remaining = self.allfiles.copy()

        def Find_files(self):
            """To provide another mock callable"""
            def find_files(*args, **kwargs):
                assert_equal(kwargs.get('topdir'), repo_path)
                # return full path
                return [opj(repo_path, x) for x in self._remaining]
            return find_files

        def __call__(self, s):
            bs = basename(s)
            assert (bs in self._remaining)
            self.unlinked.append(bs)
            self._remaining.remove(bs)

        @property
        def remaining(self):
            # strip the repopath
            return self._remaining

    def _check(version, remaining=None, unlinked=None, **kwargs):
        """Helper - for a given version and kwargs for remove_ - what to expect"""
        version_db.version = version
        data = {'datalad_stats': ActivityStats()}
        rov = annex.remove_other_versions(db=version_db, **kwargs)  # generator
        with patch('os.unlink', new_callable=Unlinker) as cmunlink, \
                patch('os.path.lexists', return_value=True), \
                patch('datalad_crawler.nodes.annex.find_files',
                      new_callable=cmunlink.Find_files):
            out = list(rov(data))
        assert_equal(len(out), 1)
        eq_(out[0]['datalad_stats'].versions, [version])
        if remaining is not None:
            eq_(cmunlink.remaining, remaining)
        if unlinked is not None:
            eq_(cmunlink.unlinked, unlinked)
        eq_(cmunlink.remaining.union(cmunlink.unlinked), cmunlink.allfiles)

    def check(*args, **kwargs):
        _check(*args, **kwargs)
        kwargs = kwargs.copy()
        # with remove_unversioned=True results in our test should be the same
        kwargs['remove_unversioned'] = True
        _check(*args, **kwargs)

    check('1.0.0', remaining={'a_1.0.0'})
    # even though due to overlay=0 all versions are identical, there were no
    # versions before 1.0.0, and all later ones are removed regardless of overlay
    check('1.0.0', remaining={'a_1.0.0'}, overlay=0)
    check('2.0.0', remaining={'b_2.0.0', 'c_2.0.0'})
    check('2.0.0', remaining={'b_2.0.0', 'c_2.0.0'}, overlay=1)
    check('2.0.0', remaining={'b_2.0.0', 'c_2.0.0'}, overlay=lambda x: x[:1])
    # but with overlay=0 we would also get files from 1
    check('2.0.0', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.0.0'}, overlay=0)
    check('2.0.1', remaining={'c_2.0.1'})
    check('2.0.1', remaining={'b_2.0.0', 'c_2.0.1'}, overlay=1)
    check('2.0.1', remaining={'b_2.0.0', 'c_2.0.1'}, overlay=2)
    check('2.0.1', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.0.1'}, overlay=0)
    check('2.1.1', remaining={'c_2.1.1'})
    check('2.1.1', remaining={'c_2.1.1'}, overlay=2)
    check('2.1.1', remaining={'b_2.0.0', 'c_2.1.1'}, overlay=1)
    check('2.1.1', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.1.1'}, overlay=0)
    check('3', remaining={'d_3'})
    check('3', remaining={'d_3'}, overlay=1)
    check('3', remaining={'d_3'}, overlay=2)
    check('3', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.1.1', 'd_3'}, overlay=0)

    # if name changes
    version_db.versions['2.1.1'] = {'c00.dat': 'c00_2.1.1'}
    # if we don't do anything special, it would have its own life:
    check('2.1.1', remaining={'c00_2.1.1'})
    check('2.1.1', remaining={'c00_2.1.1'}, overlay=2)
    check('2.1.1', remaining={'b_2.0.0', 'c_2.0.1', 'c00_2.1.1'}, overlay=1)
    check('2.1.1', remaining={'a_1.0.0', 'b_2.0.0', 'c_2.0.1', 'c00_2.1.1'},
          overlay=0)
    # but we should be able to "unify" the unversioned name further by
    # providing replacement patterns
    kw = dict(fpath_subs=[('c00', 'c'), (r'\.dat', '')])
    check('2.1.1', remaining={'b_2.0.0', 'c00_2.1.1'}, overlay=1, **kw)
    check('2.1.1', remaining={'a_1.0.0', 'b_2.0.0', 'c00_2.1.1'}, overlay=0, **kw)
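
# Reading aid, inferred from the checks above rather than stated anywhere in
# the implementation: `overlay` controls how many leading version components
# must match for files of an older version to survive under a newer one.
#   overlay omitted  -- only files of the requested version remain
#   overlay=2        -- e.g. b_2.0.0 survives for 2.0.1 (matches on '2.0')
#                       but not for 2.1.1
#   overlay=1        -- b_2.0.0 survives for any 2.x version
#   overlay=0        -- all versions overlay one another, so every file not
#                       superseded by name remains
#   overlay=callable -- custom grouping key, e.g. by the first character of
#                       the version string:
#       annex.remove_other_versions(db=version_db, overlay=lambda v: v[:1])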
def _test_annex_file(mode, topdir, topurl, outdir):
    annex = Annexificator(path=outdir, mode=mode,
                          statusdb='fileattr',
                          largefiles="exclude=*.txt")

    input = {'url': "%sd1/1.dat" % topurl, 'filename': '1-copy.dat'}
    tfile = opj(outdir, '1-copy.dat')
    # we add full filepath now
    expected_output = [dict(filepath=opj(outdir, input['filename']), **input)]
    output = list(annex(input))
    assert_equal(expected_output, output)

    # addurl is batched, and we haven't forced annex flushing so there should
    # be a batched process
    if not annex.repo.fake_dates_enabled:
        assert_equal(len(annex.repo._batched), 1)
    # if we finalize, it should flush batched annexes and commit
    list(annex.finalize()({}))
    assert (lexists(tfile))
    ok_file_under_git(tfile, annexed=True)
    if mode == 'full':
        ok_file_has_content(tfile, '1.dat load')
    else:
        # in fast or relaxed mode there must not be any content
        assert_raises(AssertionError, ok_file_has_content, tfile, '1.dat load')

    whereis = annex.repo.whereis(tfile)
    assert_in(annex.repo.WEB_UUID, whereis)  # url must have been added
    assert_equal(len(whereis), 1 + int(mode == 'full'))
    # TODO: check the url

    # Neither file should be attempted to be downloaded again, since nothing
    # changed and by default we do use the files db
    output = list(annex(input))
    assert_equal(output, [])  # nothing was done, so annex didn't yield data
    annex.yield_non_updated = True

    input_with_stats = input.copy()
    input_with_stats['datalad_stats'] = ActivityStats()
    output = list(annex(input_with_stats))
    assert_equal(output[0]['datalad_stats'],
                 ActivityStats(files=1, urls=1, skipped=1))

    # but if we change that file, it should re-download it now
    with open(opj(topdir, 'd1', '1.dat'), 'a') as f:
        f.write("+")
    output = list(annex(input_with_stats))
    stats = output[0]['datalad_stats']
    stats.downloaded_time = 0
    # 2 since we are reusing the same stats
    download_stats = dict(downloaded=1, downloaded_size=11) \
        if mode == 'full' else {}
    addskip_stats = dict(add_annex=0, skipped=2, overwritten=0) \
        if mode == 'relaxed' \
        else dict(add_annex=1, skipped=1, overwritten=1)
    kwargs = download_stats.copy()
    kwargs.update(addskip_stats)
    assert_equal(stats, ActivityStats(files=2, urls=2, **kwargs))

    # Download into a file which will be added to git
    # TODO: for now added to git only in full mode. in --fast or --relaxed, still goes to annex
    # http://git-annex.branchable.com/bugs/treatment_of_largefiles_is_not_working_for_addurl_--fast___40__or_--relaxed__41__/
    input = {'url': "%sd1/1.dat" % topurl,
             'filename': '1.txt',
             'datalad_stats': ActivityStats()}
    tfile = opj(outdir, '1.txt')
    output = list(annex(input))
    annexed = mode not in {'full'}
    list(annex.finalize()({}))
    if not annexed:
        ok_file_has_content(tfile, '1.dat load+')
    else:
        assert_raises(AssertionError, ok_file_has_content, tfile, '1.dat load+')
    ok_file_under_git(tfile, annexed=annexed)
    assert_equal(len(output), 1)
    stats = output[0]['datalad_stats']
    # reset varying metric
    stats.downloaded_time = 0
    assert_equal(stats,
                 ActivityStats(files=1, urls=1,
                               add_git=1 - int(annexed),
                               add_annex=int(annexed),
                               **download_stats))

    # Let's add a file without specifying URL
    sfilepath = opj(outdir, 'sample.txt')
    with open(sfilepath, 'w') as f:
        f.write("sample")
    ok_file_has_content(sfilepath, "sample")
    output = list(annex({'filename': 'sample.txt',
                         'datalad_stats': ActivityStats()}))
    ok_file_under_git(sfilepath, annexed=False)
    assert (output)
    assert_equal(output[0]['datalad_stats'], ActivityStats(files=1, add_git=1))
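
# Reading aid (an assumption mirroring git-annex addurl semantics, not
# asserted by this helper directly): the three `mode` values differ in what
# is fetched when a URL is annexed --
#   'full'    downloads content, so ok_file_has_content() passes;
#   'fast'    records the URL and size without downloading (addurl --fast);
#   'relaxed' records only the URL, not even the size (addurl --relaxed).
# A hypothetical construction for a non-downloading variant:
#   annex = Annexificator(path=outdir, mode='fast', statusdb='fileattr')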
def test_openfmri_pipeline2(ind, topurl, outd):
    # no versioned files -- should still work! ;)
    list(initiate_dataset(template="openfmri",
                          dataset_name='dataladtest-ds666',
                          path=outd,
                          data_fields=['dataset'])({'dataset': 'ds666'}))

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                 for b in branches}
    # all commits out there:
    # backend set, dataset init, crawler init, incoming (shares with master -1),
    # (2 or 3 commits, depending on create variant)
    # incoming-processed, merge, aggregate metadata:
    ncommits_master = len(commits_hexsha['master'])
    assert_in(ncommits_master, [5, 6])
    assert_in(len(commits_l['master']), [4, 5])
    eq_(len(commits_hexsha['incoming']), ncommits_master - 2)
    eq_(len(commits_l['incoming']), ncommits_master - 2)
    eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 1)
    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 2)

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    eq_(out[0]['datalad_stats'], ActivityStats(files=2, skipped=2, urls=2))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    os.rename(opj(ind, 'ds666', 'ds666_R2.0.0.tar.gz'),
              opj(ind, 'ds666', 'ds666.tar.gz'))

    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)
    eq_(out[0]['datalad_stats'], ActivityStats())  # was committed
    stats_total = out[0]['datalad_stats'].get_total()
    stats_total.downloaded_size = 0
    eq_(stats_total,
        ActivityStats(files=4, overwritten=1, skipped=1, downloaded=1,
                      merges=[['incoming', 'incoming-processed']],
                      versions=['1.0.0'],
                      renamed=1, urls=2, add_annex=2))
    # in reality there is also a 1.0.0+1 tag since the file changed but
    # carried no version suffix
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1'])

    check_dropall_get(repo)
def test_openfmri_pipeline1(ind, topurl, outd, clonedir):
    index_html = opj(ind, 'ds666', 'index.html')

    list(initiate_dataset(template="openfmri",
                          dataset_name='dataladtest-ds666',
                          path=outd,
                          data_fields=['dataset'])({'dataset': 'ds666'}))

    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Since datalad 0.11.2 all .metadata/objects go under annex.
    # Here we have a test where we force drop all annexed content,
    # to mitigate that let's place all metadata under git
    dotdatalad_attributes_file = opj('.datalad', '.gitattributes')
    repo.set_gitattributes([('metadata/objects/**',
                             {'annex.largefiles': '(nothing)'})],
                           dotdatalad_attributes_file)
    # --amend so we do not cause change in # of commits below
    repo.commit("gitattributes", files=dotdatalad_attributes_file,
                options=['--amend'])

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    t1w_fpath_nover = opj(outd, 'sub1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath_nover, "mighty load in old format")

    #
    # And now versioned files were specified!
    #
    add_to_index(index_html, content=_versioned_files)

    with chpwd(outd):
        pipeline = ofpipeline('ds666', versioned_urls=False, topurl=topurl)
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    ok_(not exists(t1w_fpath_nover),
        "%s file should no longer be there if unversioned files get removed correctly"
        % t1w_fpath_nover)
    repo = AnnexRepo(outd, create=False)  # to be used in the checks
    # Inspect the tree -- that we have all the branches
    branches = {'master', 'incoming', 'incoming-processed', 'git-annex'}
    eq_(set(repo.get_branches()), branches)
    # We do not have custom changes in master yet, so it just follows incoming-processed atm
    # eq_(repo.get_hexsha('master'), repo.get_hexsha('incoming-processed'))
    # Since we did initiate_dataset -- now we have separate master!
    assert_not_equal(repo.get_hexsha('master'),
                     repo.get_hexsha('incoming-processed'))
    # and that one is different from incoming
    assert_not_equal(repo.get_hexsha('incoming'),
                     repo.get_hexsha('incoming-processed'))

    # actually the tree should look quite neat with 1.0.0 tag having 1 parent in incoming
    # 1.0.1 having 1.0.0 and the 2nd commit in incoming as parents
    commits_hexsha = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                 for b in branches}
    # all commits out there:
    # dataset init, crawler init
    # (2 commits)
    # + 3*(incoming, processed, merge)
    # + 3*aggregate-metadata update
    # - 1 since now that incoming starts with master, there is one less merge
    # In --incremental mode there is a side effect of absent now
    # 2*remove of obsolete metadata object files,
    # see https://github.com/datalad/datalad/issues/2772
    # TODO inspect by knowledgeable person and re-enable
    #ncommits_master = len(commits_hexsha['master'])
    #assert_in(ncommits_master, [13, 14])
    #assert_in(len(commits_l['master']), [8, 9])

    # TODO inspect by knowledgeable person and re-enable
    #eq_(len(commits_hexsha['incoming']), ncommits_master - 8)
    #eq_(len(commits_l['incoming']), ncommits_master - 8)
    #eq_(len(commits_hexsha['incoming-processed']), ncommits_master - 5)
    #eq_(len(commits_l['incoming-processed']), ncommits_master - 8)

    # Check tags for the versions
    eq_(out[0]['datalad_stats'].get_total().versions, ['1.0.0', '1.0.1'])
    # +1 because original "release" was assumed to be 1.0.0
    repo_tags = repo.get_tags()
    eq_(repo.get_tags(output='name'), ['1.0.0', '1.0.0+1', '1.0.1'])
    # Ben: The tagged ones currently are the ones with the message
    # '[DATALAD] dataset aggregate metadata update\n':
    #eq_(repo_tags[0]['hexsha'], commits_l['master'][4])  # next to the last one
    #eq_(repo_tags[-1]['hexsha'], commits_l['master'][0])  # the last one

    def hexsha(l):
        return l.__class__(x.hexsha for x in l)

    # TODO requires additional tooling to re-enable
    ## Verify that we have desired tree of merges
    #eq_(hexsha(commits_l['incoming-processed'][0].parents),
    #    (commits_l['incoming-processed'][1],
    #     commits_l['incoming'][0]))
    #eq_(hexsha(commits_l['incoming-processed'][2].parents),
    #    (commits_l['incoming-processed'][3],  # also in master
    #     commits_l['incoming'][2],))

    # ben: The following two comparisons are targeting these commits:
    # commit "Merge branch 'incoming-processed'\n" in commits_l['master'],
    # parents are:
    # commit "[DATALAD] dataset aggregate metadata update\n" in commits_l['master'] and
    # commit "[DATALAD] Added files from extracted archives\n\nFiles processed: 6\n renamed: 2\n +annex: 3\nBranches merged: incoming->incoming-processed\n" in commits_l['incoming-processed']
    # TODO requires additional tooling to re-enable
    #eq_(hexsha(commits_l['master'][1].parents), (commits_l['master'][2],
    #                                             commits_l['incoming-processed'][0]))
    #eq_(hexsha(commits_l['master'][3].parents), (commits_l['master'][4],
    #                                             commits_l['incoming-processed'][1]))

    with chpwd(outd):
        eq_(set(glob('*')), {'changelog.txt', 'sub-1'})
        all_files = sorted(find_files('.'))

    t1w_fpath = opj(outd, 'sub-1', 'anat', 'sub-1_T1w.dat')
    ok_file_has_content(t1w_fpath, "mighty load 1.0.1")
    ok_file_under_git(opj(outd, 'changelog.txt'), annexed=False)
    ok_file_under_git(t1w_fpath, annexed=True)

    try:
        # this is the new way
        from datalad.metadata.metadata import get_ds_aggregate_db_locations
        ds = Dataset('.')
        dbloc, objbase = get_ds_aggregate_db_locations(ds)
        dbloc = op.relpath(dbloc, start=ds.path)
    except ImportError:
        # this stopped working in early 2019 versions of datalad
        from datalad.metadata.metadata import agginfo_relpath
        dbloc = agginfo_relpath

    target_files = {
        './.datalad/config',
        './.datalad/crawl/crawl.cfg',
        # no more!
        # './.datalad/config.ttl', './.datalad/datalad.ttl',
        './.datalad/crawl/statuses/incoming.json',
        './.datalad/crawl/versions/incoming.json',
        './changelog.txt',
        './sub-1/anat/sub-1_T1w.dat',
        './sub-1/beh/responses.tsv',
        './' + dbloc,
    }
    target_incoming_files = {
        '.gitattributes',  # we marked default backend right in the incoming
        # we now base 'incoming' on master branch, so we get all those as well
        '.datalad/.gitattributes',
        '.datalad/config',
        '.datalad/crawl/crawl.cfg',
        'changelog.txt',
        'ds666.tar.gz',
        'ds666-beh_R1.0.1.tar.gz',
        'ds666_R1.0.0.tar.gz',
        'ds666_R1.0.1.tar.gz',
        'ds666_R2.0.0.tar.gz',
        '.datalad/crawl/statuses/incoming.json',
        '.datalad/crawl/versions/incoming.json'
    }
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the
    # number of created object files in addition to that comparison
    eq_(set([f for f in all_files
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # check that -beh was committed in 2nd commit in incoming, not the first one
    assert_not_in('ds666-beh_R1.0.1.tar.gz',
                  repo.get_files(commits_l['incoming'][-1]))
    assert_in('ds666-beh_R1.0.1.tar.gz',
              repo.get_files(commits_l['incoming'][0]))

    # rerun pipeline -- make sure we are on the same in all branches!
    with chpwd(outd):
        out = run_pipeline(pipeline)
    eq_(len(out), 1)

    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    eq_(commits_hexsha, commits_hexsha_)  # i.e. nothing new
    # actually we do manage to add_git 1 (README) since it is generated and
    # committed directly to git
    # BUT now fixed -- if not committed (was the same), should be marked as skipped
    # Nothing was committed so stats leaked all the way up
    eq_(out[0]['datalad_stats'], ActivityStats(files=5, skipped=5, urls=5))
    eq_(out[0]['datalad_stats'], out[0]['datalad_stats'].get_total())

    # rerun pipeline when new content is available
    # add new revision, rerun pipeline and check that stuff was processed/added correctly
    add_to_index(
        index_html,
        content='<a href="ds666_R2.0.0.tar.gz">Raw data on AWS version 2.0.0</a>')

    with chpwd(outd):
        out = run_pipeline(pipeline)
        all_files_updated = sorted(find_files('.'))
    eq_(len(out), 1)
    assert_not_equal(out[0]['datalad_stats'].get_total(), ActivityStats())
    # there are no overlays ATM, so behav would be gone since no 2.0.0 for it!
    target_files.remove('./sub-1/beh/responses.tsv')
    # Ben: metadata object files may differ in their names containing some checksum-ish shit ...
    # TODO: Check how those names are constructed and maybe at least count the
    # number of created object files in addition to that comparison
    eq_(set([f for f in all_files_updated
             if not f.startswith('./.datalad/metadata/objects/')]),
        target_files)

    # new instance so it re-reads git stuff etc
    # repo = AnnexRepo(outd, create=False)  # to be used in the checks
    commits_hexsha_ = {b: list(_get_branch_commits(repo, b)) for b in branches}
    commits_l_ = {b: list(_get_branch_commits(repo, b, limit='left-only'))
                  for b in branches}
    assert_not_equal(commits_hexsha, commits_hexsha_)
    eq_(out[0]['datalad_stats'], ActivityStats())  # commit happened so stats were consumed
    # numbers seem to be right
    total_stats = out[0]['datalad_stats'].get_total()
    # but for some reason downloaded_size fluctuates.... why? probably archiving...?
    total_stats.downloaded_size = 0
    eq_(total_stats,
        ActivityStats(files=8, skipped=5, downloaded=1, renamed=1, urls=6,
                      add_annex=2,
                      # add_git=1,  # README
                      versions=['2.0.0'],
                      merges=[['incoming', 'incoming-processed']]))

    check_dropall_get(repo)

    # Let's see if pipeline would remove files we stopped tracking
    remove_from_index(index_html, '<a href=.ds666_R1.0.0[^<]*</a>')
    with chpwd(outd):
        with swallow_logs(new_level=logging.WARNING) as cml:
            out = run_pipeline(pipeline)
            # since files get removed in incoming, but reprocessed completely
            # in incoming-processed and merged into master -- new commits will come
            # They shouldn't have any difference but still should be new commits
            assert_in("There is already a tag 2.0.0 in the repository", cml.out)
    eq_(len(out), 1)
    incoming_files = repo.get_files('incoming')
    target_incoming_files.remove('ds666_R1.0.0.tar.gz')
    eq_(set(incoming_files), target_incoming_files)
    commits_hexsha_removed = {b: list(_get_branch_commits(repo, b))
                              for b in branches}
    # our 'statuses' database should have recorded the change thus got a diff
    # which propagated through all branches
    for b in 'master', 'incoming-processed':
        # with non-persistent DB we had no changes
        # eq_(repo.repo.branches[b].commit.diff(commits_hexsha_[b][0]), [])
        assert_in(repo.pathobj / '.datalad/crawl/statuses/incoming.json',
                  repo.diff(b, commits_hexsha_[b][0]))
    dincoming = repo.diff('incoming', commits_hexsha_['incoming'][0])
    eq_(len(dincoming), 2)  # 2 diff objects -- 1 file removed, 1 statuses updated
    eq_(set(dincoming.keys()),
        {repo.pathobj / '.datalad/crawl/statuses/incoming.json',
         repo.pathobj / 'ds666_R1.0.0.tar.gz'})
    eq_(out[0]['datalad_stats'].get_total().removed, 1)
    assert_not_equal(commits_hexsha_, commits_hexsha_removed)

    # we will check if a clone would be crawling just as good
    from datalad.api import crawl

    # make a brand new clone
    GitRepo.clone(outd, clonedir)

    def _pipeline(*args, **kwargs):
        """Helper to mock openfmri.pipeline invocation so it looks at our 'server'"""
        kwargs = updated(kwargs, {'topurl': topurl, 'versioned_urls': False})
        return ofpipeline(*args, **kwargs)

    with chpwd(clonedir), patch.object(openfmri, 'pipeline', _pipeline):
        output, stats = crawl()  # we should be able to recrawl without doing anything
        ok_(stats, ActivityStats(files=6, skipped=6, urls=5))
def __call__(path=None, is_pipeline=False, is_template=False,
             recursive=False, chdir=None):  # dry_run=False,
    dry_run = False

    from datalad.crawler.pipeline import (
        load_pipeline_from_config, load_pipeline_from_module,
        get_repo_pipeline_config_path, get_repo_pipeline_script_path
    )
    from datalad.crawler.pipeline import run_pipeline
    from datalad.utils import chpwd  # import late so we could mock during tests

    with chpwd(chdir):
        assert not (is_pipeline and is_template), \
            "it is either a pipeline or a template name, can't be both"

        if is_template:
            # generate a config and overload path with its filename
            path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                            commit=True)

        # TODO: centralize via _params_ handling
        if dry_run:
            dryrun_optlabel = 'datalad.crawl.dryrun'
            if dryrun_optlabel in cfg:
                cfg.unset(dryrun_optlabel, where='local', reload=False)
            cfg.add(dryrun_optlabel, "True", where='local')

        if path is None:
            # get config from the current repository/dataset
            if is_pipeline:
                raise ValueError("You must specify the file if --pipeline")

            # Let's see if there is a config or pipeline in this repo
            path = get_repo_pipeline_config_path()
            if not path or not exists(path):
                # Check if there may be the pipeline provided
                path = get_repo_pipeline_script_path()
                if path and exists(path):
                    is_pipeline = True

        stats = ActivityStats()

        if not path:
            raise RuntimeError("Cannot locate crawler config or pipeline file")

        if is_pipeline:
            lgr.info("Loading pipeline definition from %s" % path)
            pipeline = load_pipeline_from_module(path)
        else:
            lgr.info("Loading pipeline specification from %s" % path)
            pipeline = load_pipeline_from_config(path)

        lgr.info("Running pipeline %s" % str(pipeline))
        # TODO: capture the state of all branches so in case of crash
        # we could gracefully reset back
        try:
            output = run_pipeline(pipeline, stats=stats)
        except Exception as exc:
            # TODO: config.crawl.failure = full-reset | last-good-master
            # probably ask via ui which action should be performed unless
            # explicitly specified
            raise
        stats.datasets_crawled += 1

        # TODO: Move gc/clean over here!

        stats_total = stats.get_total()

        if recursive:
            # get all subdatasets, and crawl them too!
            ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
            import os
            from ..distribution.dataset import Dataset
            from ..api import crawl
            from ..utils import swallow_logs
            from ..dochelpers import exc_str
            # Note: we could collect all datasets to be crawled here or pass recursive=True
            # into the subdatasets' crawl.  We will collect all of them here so we might later
            # also introduce automatic commits when super-dataset got successfully updated
            subdatasets = Dataset(os.curdir).get_subdatasets(recursive=recursive)

            lgr.info("Crawling %d subdatasets", len(subdatasets))
            output = [output]
            # TODO: parallelize
            # TODO: assumes that all sub-datasets are 'crawlable', and if not,
            # just adds them to the crawl_failed count.  But maybe we should make
            # it more explicit that some sub-datasets might not need to be
            # crawled, so they get skipped explicitly?
            for ds_ in subdatasets:
                ds_logfile = utils.get_logfilename(ds_, 'crawl')
                try:
                    # TODO: might be cool to be able to report a 'heart beat'
                    # from the swallow into pbar or smth
                    with swallow_logs(file_=ds_logfile) as cml:
                        output_, stats_ = crawl(chdir=ds_)
                        stats_total += stats_
                        output.append(output_)
                    lgr.info("Crawled %s: %s (log: %s)",
                             ds_, stats_.as_str(mode='line'), ds_logfile)
                except Exception as exc:
                    stats_total.datasets_crawl_failed += 1
                    stats_total.datasets_crawled += 1
                    output += [None]
                    lgr.warning("Crawling of %s has failed (more in %s): %s.",
                                # Log output: %s",
                                ds_, ds_logfile, exc_str(exc))  # , cml.out)

        lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

        return output, stats_total
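
# Hedged usage sketch (assuming this implementation is what backs
# datalad.api.crawl, as the tests elsewhere in this section exercise it):
# crawl the dataset in the current directory plus all registered
# subdatasets, then report the aggregated stats.
from datalad.api import crawl

output, stats_total = crawl(recursive=True)
print("Total: %s" % stats_total.as_str(mode='line'))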
def __call__(
        archive,
        *,
        dataset=None,
        annex=None,
        add_archive_leading_dir=False,
        strip_leading_dirs=False,
        leading_dirs_depth=None,
        leading_dirs_consider=None,
        use_current_dir=False,
        delete=False,
        key=False,
        exclude=None,
        rename=None,
        existing='fail',
        annex_options=None,
        copy=False,
        commit=True,
        allow_dirty=False,
        stats=None,
        drop_after=False,
        delete_after=False):

    if exclude:
        exclude = ensure_tuple_or_list(exclude)
    if rename:
        rename = ensure_tuple_or_list(rename)
    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='add-archive-content')

    # set up common params for result records
    res_kwargs = {
        'action': 'add-archive-content',
        'logger': lgr,
    }

    if not isinstance(ds.repo, AnnexRepo):
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message="Can't operate in a pure Git repository",
            **res_kwargs
        )
        return
    if annex:
        warnings.warn(
            "datalad add_archive_content's `annex` parameter is "
            "deprecated and will be removed in a future release. "
            "Use the 'dataset' parameter instead.",
            DeprecationWarning)
    annex = ds.repo

    # get the archive path relative from the ds root
    archive_path = resolve_path(archive, ds=dataset)
    # let Status decide whether we can act on the given file
    for s in ds.status(
            path=archive_path,
            on_failure='ignore',
            result_renderer='disabled'):
        if s['status'] == 'error':
            if 'path not underneath the reference dataset %s' in s['message']:
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    message='Can not add archive outside of the dataset',
                    **res_kwargs)
                return
            # status errored & we haven't anticipated the cause. Bubble up
            yield s
            return
        elif s['state'] == 'untracked':
            # we can't act on an untracked file
            message = (
                "Can not add an untracked archive. "
                "Run 'datalad save {}'".format(archive)
            )
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=message,
                **res_kwargs)
            return

    if not allow_dirty and annex.dirty:
        # error out here if the dataset contains untracked changes
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message=(
                'clean dataset required. '
                'Use `datalad status` to inspect unsaved changes'),
            **res_kwargs
        )
        return

    # ensure the archive exists, status doesn't error on a non-existing file
    if not key and not lexists(archive_path):
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message=(
                'No such file: {}'.format(archive_path),
            ),
            **res_kwargs
        )
        return

    if not key:
        check_path = archive_path.relative_to(ds.pathobj)
        # TODO: support adding archives content from outside the annex/repo
        origin = 'archive'
        # can become get_file_annexinfo once #6104 is merged
        key = annex.get_file_annexinfo(check_path)['key']
        if not key:
            raise RuntimeError(
                f"Archive must be an annexed file in {ds}")
        archive_dir = Path(archive_path).parent
    else:
        origin = 'key'
        key = archive
        # We must not have anything to do with the location under .git/annex
        archive_dir = None
        # instead, we will go from the current directory
        use_current_dir = True

    archive_basename = file_basename(archive)

    if not key:
        # if we didn't manage to get a key, the file must be in Git
        raise NotImplementedError(
            "Provided file %s does not seem to be under annex control. "
            "We don't support adding everything straight to Git" % archive
        )

    # figure out our location
    pwd = getpwd()
    # are we in a subdirectory of the repository?
    pwd_in_root = annex.path == archive_dir
    # then we should add content under that subdirectory,
    # get the path relative to the repo top
    if use_current_dir:
        # extract the archive under the current directory, not the directory
        # where the archive is located
        extract_rpath = Path(pwd).relative_to(ds.path) \
            if not pwd_in_root \
            else None
    else:
        extract_rpath = archive_dir.relative_to(ds.path)

    # relpath might return '.' as the relative path to curdir, which then normalize_paths
    # would take as instructions to really go from cwd, so we need to sanitize
    if extract_rpath == curdir:
        extract_rpath = None

    try:
        key_rpath = annex.get_contentlocation(key)
    except:
        # the only probable reason for this to fail is that there is no
        # content present
        raise RuntimeError(
            "Content of %s seems to be N/A. Fetch it first" % key
        )

    # now we simply need to go through every file in that archive and
    lgr.info(
        "Adding content of the archive %s into annex %s", archive, annex
    )

    from datalad.customremotes.archives import ArchiveAnnexCustomRemote

    # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
    # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
    # OK, let's ignore that the following class is actually a special
    # remote implementation, and use it only to work with its cache
    annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                            path=annex.path,
                                            persistent_cache=True)
    # We will move extracted content so it must not exist prior to running
    annexarchive.cache.allow_existing = True
    earchive = annexarchive.cache[key_rpath]
    # make sure there is an enabled datalad-archives special remote
    ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                          autoenable=True)

    precommitted = False
    old_always_commit = annex.always_commit
    # batch mode is disabled when faking dates, we want to always commit
    annex.always_commit = annex.fake_dates_enabled

    if annex_options:
        if isinstance(annex_options, str):
            annex_options = split_cmdline(annex_options)

    delete_after_rpath = None

    prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                           dir=annex.path)) \
        if delete_after \
        else None

    # dedicated stats which would be added to passed in (if any)
    outside_stats = stats
    stats = ActivityStats()

    try:
        # keep track of extracted files for progress bar logging
        file_counter = 0
        # iterate over all files in the archive
        extracted_files = list(earchive.get_extracted_files())
        # start a progress bar for extraction
        pbar_id = f'add-archive-{archive_path}'
        log_progress(
            lgr.info, pbar_id, 'Extracting archive',
            label="Extracting archive",
            unit=' Files',
            total=len(extracted_files),
            noninteractive_level=logging.INFO)
        for extracted_file in extracted_files:
            file_counter += 1
            files_left = len(extracted_files) - file_counter
            log_progress(
                lgr.info, pbar_id,
                "Files to extract %i ", files_left,
                update=1,
                increment=True,
                noninteractive_level=logging.DEBUG)
            stats.files += 1
            extracted_path = Path(earchive.path) / Path(extracted_file)

            if extracted_path.is_symlink():
                link_path = str(extracted_path.resolve())
                if not exists(link_path):
                    # TODO: config  addarchive.symlink-broken='skip'
                    lgr.warning(
                        "Path %s points to non-existing file %s"
                        % (extracted_path, link_path)
                    )
                    stats.skipped += 1
                    continue
                    # TODO: check if points outside of archive - warn & skip

            url = annexarchive.get_file_url(
                archive_key=key,
                file=extracted_file,
                size=os.stat(extracted_path).st_size)

            # preliminary target name which might get modified by renames
            target_file_orig = target_file = Path(extracted_file)

            # stream archives would not have had the original filename
            # information in them, so would be extracted under a name
            # derived from their annex key.
            # Provide ad-hoc handling for such cases
            if (len(extracted_files) == 1
                    and Path(archive).suffix in ('.xz', '.gz', '.lzma')
                    and Path(key_rpath).name.startswith(
                        Path(extracted_file).name)):
                # take archive's name without extension for filename & place
                # where it was originally extracted
                target_file = \
                    Path(extracted_file).parent / Path(archive).stem

            if strip_leading_dirs:
                leading_dir = earchive.get_leading_directory(
                    depth=leading_dirs_depth, exclude=exclude,
                    consider=leading_dirs_consider)
                leading_dir_len = \
                    len(leading_dir) + len(opsep) if leading_dir else 0
                target_file = str(target_file)[leading_dir_len:]

            if add_archive_leading_dir:
                # place extracted content under a directory corresponding to
                # the archive name with suffix stripped.
                target_file = Path(archive_basename) / target_file

            if rename:
                target_file = apply_replacement_rules(rename,
                                                      str(target_file))

            # continue to next iteration if extracted_file is excluded
            if exclude:
                try:
                    # since we need to skip outside loop from inside loop
                    for regexp in exclude:
                        if re.search(regexp, extracted_file):
                            lgr.debug(
                                "Skipping {extracted_file} since contains "
                                "{regexp} pattern".format(**locals()))
                            stats.skipped += 1
                            raise StopIteration
                except StopIteration:
                    continue

            if delete_after:
                # place target file in a temporary directory
                target_file = Path(prefix_dir) / Path(target_file)
                # but also allow for it in the orig
                target_file_orig = Path(prefix_dir) / Path(target_file_orig)

            target_file_path_orig = annex.pathobj / target_file_orig

            # If we were invoked in a subdirectory, patch together the
            # correct path
            target_file_path = extract_rpath / target_file \
                if extract_rpath else target_file
            target_file_path = annex.pathobj / target_file_path

            # when the file already exists...
            if lexists(target_file_path):
                handle_existing = True
                if md5sum(str(target_file_path)) == \
                        md5sum(str(extracted_path)):
                    if not annex.is_under_annex(str(extracted_path)):
                        # if under annex -- must be having the same content,
                        # we should just add possibly a new extra URL
                        # but if under git -- we cannot/should not do
                        # anything about it ATM
                        if existing != 'overwrite':
                            continue
                    else:
                        handle_existing = False
                if not handle_existing:
                    pass  # nothing... just to avoid additional indentation
                elif existing == 'fail':
                    message = \
                        "{} exists, but would be overwritten by new file " \
                        "{}. Consider adjusting --existing".format\
                        (target_file_path, extracted_file)
                    yield get_status_dict(
                        ds=ds,
                        status='error',
                        message=message,
                        **res_kwargs)
                    return
                elif existing == 'overwrite':
                    stats.overwritten += 1
                    # to make sure it doesn't conflict -- might have been a
                    # tree
                    rmtree(target_file_path)
                else:
                    # an elaborate dance to piece together new archive names
                    target_file_path_orig_ = target_file_path

                    # To keep extension intact -- operate on the base of the
                    # filename
                    p, fn = os.path.split(target_file_path)
                    ends_with_dot = fn.endswith('.')
                    fn_base, fn_ext = file_basename(fn, return_ext=True)

                    if existing == 'archive-suffix':
                        fn_base += '-%s' % archive_basename
                    elif existing == 'numeric-suffix':
                        pass  # archive-suffix will have the same logic
                    else:
                        # we shouldn't get here, argparse should catch a
                        # non-existing value for --existing right away
                        raise ValueError(existing)
                    # keep incrementing index in the suffix until file
                    # doesn't collide
                    suf, i = '', 0
                    while True:
                        connector = \
                            ('.' if (fn_ext or ends_with_dot) else '')
                        file = fn_base + suf + connector + fn_ext
                        target_file_path_new = \
                            Path(p) / Path(file)
                        if not lexists(target_file_path_new):
                            # we found a file name that is not yet taken
                            break
                        lgr.debug("Iteration %i of file name finding. "
                                  "File %s already exists", i,
                                  target_file_path_new)
                        i += 1
                        suf = '.%d' % i
                    target_file_path = target_file_path_new
                    lgr.debug("Original file %s will be saved into %s"
                              % (target_file_path_orig_, target_file_path))
                    # TODO: should we reserve smth like
                    # stats.clobbed += 1

            if target_file_path != target_file_path_orig:
                stats.renamed += 1

            if copy:
                raise NotImplementedError(
                    "Not yet copying from 'persistent' cache"
                )

            lgr.debug("Adding %s to annex pointing to %s and with options "
                      "%r", target_file_path, url, annex_options)

            out_json = annex.add_url_to_file(
                target_file_path,
                url, options=annex_options,
                batch=True)

            if 'key' in out_json and out_json['key'] is not None:
                # annex.is_under_annex(target_file, batch=True):
                # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                # we need to maintain a list of those to be dropped files
                if drop_after:
                    # drop extracted files after adding to annex
                    annex.drop_key(out_json['key'], batch=True)
                    stats.dropped += 1
                stats.add_annex += 1
            else:
                lgr.debug("File {} was added to git, not adding url".format(
                    target_file_path))
                stats.add_git += 1

            if delete_after:
                # we count the removal here, but don't yet perform it
                # to not interfere with batched processes - any pure Git
                # action invokes precommit which closes batched processes.
                stats.removed += 1

            # Done with target_file -- just to have clear end of the loop
            del target_file

        if delete and archive and origin != 'key':
            lgr.debug("Removing the original archive {}".format(archive))
            # force=True since sometimes it might still be staged and fail
            annex.remove(str(archive_path), force=True)

        lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

        if outside_stats:
            outside_stats += stats
        if delete_after:
            # force since not committed. r=True for -r (passed into git call
            # to recurse)
            delete_after_rpath = opj(extract_rpath, prefix_dir) \
                if extract_rpath else prefix_dir
            delete_after_rpath = resolve_path(delete_after_rpath,
                                              ds=dataset)
            lgr.debug(
                "Removing extracted and annexed files under %s",
                delete_after_rpath
            )
            annex.remove(str(delete_after_rpath), r=True, force=True)
        if commit:
            archive_rpath = archive_path.relative_to(ds.path)
            commit_stats = outside_stats if outside_stats else stats
            # so batched ones close and files become annex symlinks etc
            annex.precommit()
            precommitted = True
            if any(r.get('state', None) != 'clean'
                   for p, r in annex.status(untracked='no').items()):
                annex.commit(
                    "Added content extracted from %s %s\n\n%s" %
                    (origin, archive_rpath, commit_stats.as_str(mode='full')),
                    _datalad_msg=True
                )
                commit_stats.reset()
        else:
            # don't commit upon completion
            pass
    finally:
        # take down the progress bar
        log_progress(
            lgr.info, pbar_id,
            'Finished extraction',
            noninteractive_level=logging.INFO)
        # since we batched addurl, we should close those batched processes
        # if we haven't done so yet -- explicitly checked to avoid any
        # possible "double-action"
        if not precommitted:
            annex.precommit()

        if delete_after_rpath:
            delete_after_path = opj(annex.path, delete_after_rpath)
            delete_after_rpath = resolve_path(delete_after_rpath,
                                              ds=dataset)
            if exists(delete_after_path):  # should not be there
                # but for paranoid yoh
                lgr.warning(
                    "Removing temporary directory under which extracted "
                    "files were annexed and should have been removed: %s",
                    delete_after_path)
                rmtree(delete_after_path)

        annex.always_commit = old_always_commit
        # remove what is left and/or everything upon failure
        earchive.clean(force=True)
        # remove tempfile directories (not cleaned up automatically):
        if prefix_dir is not None and lexists(prefix_dir):
            os.rmdir(prefix_dir)

    yield get_status_dict(
        ds=ds,
        status='ok',
        **res_kwargs)
    return annex
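
# Hedged usage sketch (parameter names come from the signature above; the
# archive name is made up): extract an annexed tarball into the dataset,
# stripping its leading directory and removing the original tarball.
from datalad.api import add_archive_content

for res in add_archive_content(
        '1.tar',
        dataset='.',
        strip_leading_dirs=True,
        existing='archive-suffix',
        delete=True):
    print(res['status'])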
@with_tree(
    tree={
        'pipeline.py': 'pipeline = lambda: [1]',
        'pipeline2.py': 'pipeline = lambda x: [2*x]',
    })
def test_load_pipeline_from_script(d):
    eq_(load_pipeline_from_module(opj(d, 'pipeline.py')), [1])
    eq_(load_pipeline_from_module(opj(d, 'pipeline2.py'), kwargs=dict(x=2)),
        [4])
    assert_raises(RuntimeError, load_pipeline_from_module,
                  opj(d, 'unlikelytobethere.py'))


DEFAULT_OUTPUT = [{'datalad_stats': ActivityStats()}]


def _out(ld):
    """Adjust output entry to include default outputs as well
    """
    outl = []
    for d in ld:
        out = d.copy()
        outl.append(out)
        for k, v in DEFAULT_OUTPUT[0].items():
            if k not in out:
                out[k] = v
    return outl