def test_add_delete_after_and_drop(self):
    # To test that .tar gets removed but that new stuff was added to the
    # annex repo.  We know the key since the default backend and content
    # remain the same.
    key1 = 'SHA256E-s5--16d3ad1974655987dd7801d70659990b89bfe7e931a0a358964e64e901761cc0.dat'

    # previous state of things:
    prev_files = list(find_files('.*', self.annex.path))
    assert_equal(self.annex.whereis(key1, key=True, output='full'), {})
    commits_prior = list(self.annex.get_branch_commits_('git-annex'))
    add_archive_content('1.tar', annex=self.annex,
                        strip_leading_dirs=True, delete_after=True)
    commits_after = list(self.annex.get_branch_commits_('git-annex'))
    # There should be a single commit for all additions, +1 to initiate the
    # datalad-archives special remote (gh-1258).  If faking dates, there is
    # another +1 (fake_dates_enabled is a bool counted as 0 or 1) because
    # annex.alwayscommit isn't set to false.
    assert_equal(len(commits_after),
                 len(commits_prior) + 2 + self.annex.fake_dates_enabled)
    assert_equal(prev_files, list(find_files('.*', self.annex.path)))
    w = self.annex.whereis(key1, key=True, output='full')
    assert_equal(len(w), 2)  # in archive, and locally since we didn't drop

    # Let's now do the same but also drop content
    add_archive_content('1.tar', annex=self.annex,
                        strip_leading_dirs=True, delete_after=True,
                        drop_after=True)
    assert_equal(prev_files, list(find_files('.*', self.annex.path)))
    w = self.annex.whereis(key1, key=True, output='full')
    assert_equal(len(w), 1)  # in archive only

    # there should be no .datalad temporary files hanging around
    self.assert_no_trash_left_behind()
def test_add_delete(self):
    # To test that .tar gets removed
    add_archive_content('1.tar', annex=self.annex,
                        strip_leading_dirs=True, delete=True)
    assert_false(lexists(opj(self.annex.path, '1.tar')))
def test_add_archive_content_zip(repo_path=None):
    ds = Dataset(repo_path).create(force=True)
    with chpwd(repo_path):
        with swallow_outputs():
            ds.save("1.zip", message="add 1.zip")
            add_archive_content("1.zip")
        ok_file_under_git(ds.pathobj / "1" / "foo", annexed=True)
        ok_file_under_git(ds.pathobj / "1" / "dir" / "bar", annexed=True)
        ok_archives_caches(ds.path, 0)
def test_add_archive_content_zip(repo_path):
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        with swallow_outputs():
            repo.add(["1.zip"])
            repo.commit("add 1.zip")
            add_archive_content("1.zip")
        ok_file_under_git(opj(repo.path, "1", "foo"), annexed=True)
        ok_file_under_git(opj("1", "dir", "bar"), annexed=True)
        ok_archives_caches(repo.path, 0)
def test_annex_get_from_subdir(topdir):
    from datalad.api import add_archive_content

    annex = AnnexRepo(topdir, init=True)
    annex.add('a.tar.gz', commit=True)
    add_archive_content('a.tar.gz', annex=annex, delete=True)
    fpath = opj(topdir, 'a', 'd', fn_inarchive_obscure)

    with chpwd(opj(topdir, 'a', 'd')):
        runner = Runner()
        # run git annex drop and verify the file's content is gone
        runner(['git', 'annex', 'drop', fn_inarchive_obscure])
        assert_false(annex.file_has_content(fpath))
        # run git annex get and verify the content is back
        runner(['git', 'annex', 'get', fn_inarchive_obscure])
        assert_true(annex.file_has_content(fpath))
def test_add_archive_content_absolute_path(path):
    repo = AnnexRepo(opj(path, "ds"), create=True)
    repo.add(["1.tar.gz"])
    repo.commit("1.tar.gz")
    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, annex=repo)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)

    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)

    with assert_raises(FileNotInRepositoryError):
        add_archive_content(opj(path, "notds", "2.tar.gz"), annex=repo)
def test_add_archive_use_archive_dir(repo_path):
    repo = AnnexRepo(repo_path, create=True)
    with chpwd(repo_path):
        # Let's add first archive to the repo with default setting
        archive_path = opj('4u', '1.tar.gz')
        # check it gives informative error if archive is not already added
        with assert_raises(RuntimeError) as cmr:
            add_archive_content(archive_path)
        assert_re_in(
            "You should run ['\"]datalad save 4u\\\\1\\.tar\\.gz['\"] first"
            if on_windows else
            "You should run ['\"]datalad save 4u/1\\.tar\\.gz['\"] first",
            str(cmr.exception), match=False)
        with swallow_outputs():
            repo.add(archive_path)
        repo.commit("added 1.tar.gz")

        ok_archives_caches(repo.path, 0)
        add_archive_content(archive_path, strip_leading_dirs=True,
                            use_current_dir=True)
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(repo.path, '1 f.txt', annexed=True)
        ok_archives_caches(repo.path, 0)

        # and now let's extract under archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(repo.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(repo.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(repo.path, 0)
def test_add_archive_content_strip_leading(path_orig, url, repo_path):
    with chpwd(repo_path):
        repo = AnnexRepo(repo_path, create=True)
        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
        repo.commit("added 1.tar.gz")

        add_archive_content('1.tar.gz', strip_leading_dirs=True)
        ok_(not exists('1'))
        ok_file_under_git(repo.path, '1 f.txt', annexed=True)
        ok_file_under_git('d', '1d', annexed=True)
        ok_archives_caches(repo.path, 0)
def test_add_archive_dirs(path_orig=None, url=None, repo_path=None):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        ds = Dataset(repo_path).create(force=True)
        repo = ds.repo
        # add archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # Since inconsistent and seems in many cases no
            # leading dirs to strip, keep them as provided
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        eq_(
            repo.get_description(
                uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]),
            '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        # posixify paths to make it work on Windows as well
        all_files = [Path(file).as_posix() for file in all_files]
        target_files = {
            'CR24A/behaving1/1 f.txt',
            'CR24C/behaving3/3 f.txt',
            'CR24D/behaving2/2 f.txt',
            '.datalad/config',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name
        # was getting stripped by leading_dir_len
        # if stripping and exclude didn't work this fails
        assert_false(exists('__MACOSX'))
        # if exclude doesn't work then name of subdir gets stripped by
        # leading_dir_len
        assert_false(exists('c-1_data'))
        # if exclude doesn't work but everything else works this fails
        assert_false(exists('CR24B'))
def test_add_archive_leading_dir(self):
    import os
    os.mkdir(opj(self.annex.path, 'sub'))
    f123 = opj('sub', '123.tar')
    os.rename(opj(self.annex.path, '1.tar'), opj(self.annex.path, f123))
    self.annex.remove('1.tar', force=True)
    self.annex.add(f123)
    self.annex.commit(msg="renamed")
    add_archive_content(f123, annex=self.annex,
                        add_archive_leading_dir=True,
                        strip_leading_dirs=True)
    ok_file_under_git(self.annex.path, opj('sub', '123', 'file.txt'),
                      annexed=True)
def test_add_archive_dirs(path_orig, url, repo_path):
    # change to repo_path
    with chpwd(repo_path):
        # create annex repo
        repo = AnnexRepo(repo_path, create=True)
        # add archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
        repo.commit("added 1.tar.gz")

        # test with excludes and annex options
        add_archive_content(
            '1.tar.gz',
            existing='archive-suffix',
            # Since inconsistent and seems in many cases no
            # leading dirs to strip, keep them as provided
            strip_leading_dirs=True,
            delete=True,
            leading_dirs_consider=['crcns.*', '1'],
            leading_dirs_depth=2,
            use_current_dir=False,
            exclude='.*__MACOSX.*')  # some junk penetrates

        if external_versions['cmd:annex'] >= '6.20170208':
            # should have fixed remotes
            eq_(
                repo.get_description(
                    uuid=DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE]),
                '[%s]' % ARCHIVES_SPECIAL_REMOTE)

        all_files = sorted(find_files('.'))
        target_files = {
            './CR24A/behaving1/1 f.txt',
            './CR24C/behaving3/3 f.txt',
            './CR24D/behaving2/2 f.txt',
        }
        eq_(set(all_files), target_files)

        # regression test: the subdir in MACOSX wasn't excluded and its name
        # was getting stripped by leading_dir_len
        # if stripping and exclude didn't work this fails
        assert_false(exists('__MACOSX'))
        # if exclude doesn't work then name of subdir gets stripped by
        # leading_dir_len
        assert_false(exists('c-1_data'))
        # if exclude doesn't work but everything else works this fails
        assert_false(exists('CR24B'))
def test_add_archive_content_strip_leading(path_orig=None, url=None,
                                           repo_path=None):
    with chpwd(repo_path):
        ds = Dataset(repo_path).create(force=True)
        repo = ds.repo
        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
        repo.commit("added 1.tar.gz")

        add_archive_content('1.tar.gz', strip_leading_dirs=True)
        ok_(not exists('1'))
        ok_file_under_git(ds.path, '1 f.txt', annexed=True)
        ok_file_under_git('d', '1d', annexed=True)
        ok_archives_caches(ds.path, 0)
def test_override_existing_under_git(self):
    create_tree(self.annex.path, {'1.dat': 'load2'})
    self.annex.add('1.dat', git=True)
    self.annex.commit('added to git')
    add_archive_content(
        '1.tar', annex=self.annex,
        strip_leading_dirs=True,
    )
    # and we did not bother adding it to annex (for now) -- just skipped
    # since we have it and it is the same
    ok_file_under_git(self.annex.path, '1.dat', annexed=False)

    # but if we say 'overwrite' -- we would remove and replace
    add_archive_content(
        '1.tar', annex=self.annex,
        strip_leading_dirs=True,
        delete=True,
        existing='overwrite',
    )
    ok_file_under_git(self.annex.path, '1.dat', annexed=True)
def test_add_archive_use_archive_dir(repo_path=None):
    ds = Dataset(repo_path).create(force=True)
    with chpwd(repo_path):
        # Let's add first archive to the repo with default setting
        archive_path = opj('4u', '1.tar.gz')
        # check it gives informative error if archive is not already added
        res = add_archive_content(archive_path, on_failure='ignore')
        message = \
            "Can not add an untracked archive. Run 'datalad save 4u\\1.tar.gz'" \
            if on_windows else \
            "Can not add an untracked archive. Run 'datalad save 4u/1.tar.gz'"
        assert_in_results(res, action='add-archive-content', message=message,
                          status='impossible')

        with swallow_outputs():
            ds.save(archive_path)

        ok_archives_caches(ds.path, 0)
        add_archive_content(archive_path, strip_leading_dirs=True,
                            use_current_dir=True)
        ok_(not exists(opj('4u', '1 f.txt')))
        ok_file_under_git(ds.path, '1 f.txt', annexed=True)
        ok_archives_caches(ds.path, 0)

        # and now let's extract under archive dir
        add_archive_content(archive_path, strip_leading_dirs=True)
        ok_file_under_git(ds.path, opj('4u', '1 f.txt'), annexed=True)
        ok_archives_caches(ds.path, 0)

        add_archive_content(opj('4u', 'sub.tar.gz'))
        ok_file_under_git(ds.path, opj('4u', 'sub', '2 f.txt'), annexed=True)
        ok_archives_caches(ds.path, 0)
def test_annex_get_from_subdir(topdir):
    from datalad.api import add_archive_content

    annex = AnnexRepo(topdir, backend='MD5E', init=True)
    annex.add('a.tar.gz')
    annex.commit()
    add_archive_content('a.tar.gz', annex=annex, delete=True)
    fpath = op.join(topdir, 'a', 'd', fn_in_archive_obscure)

    with chpwd(op.join(topdir, 'a', 'd')):
        runner = WitlessRunner()
        # run git annex drop and verify the file's content is gone
        runner.run(['git', 'annex', 'drop', '--', fn_in_archive_obscure],
                   protocol=KillOutput)
        assert_false(annex.file_has_content(fpath))
        # run git annex get and verify the content is back
        runner.run(['git', 'annex', 'get', '--', fn_in_archive_obscure],
                   protocol=KillOutput)
        assert_true(annex.file_has_content(fpath))
def test_add_delete_after_and_drop_subdir(self):
    os.mkdir(opj(self.annex.path, 'subdir'))
    mv_out = self.annex.call_git(['mv', '1.tar', 'subdir'])
    self.annex.commit("moved into subdir")
    with chpwd(self.annex.path):
        # was failing since deleting without considering if tarball
        # was extracted in that tarball directory
        commits_prior_master = list(self.annex.get_branch_commits_())
        commits_prior = list(self.annex.get_branch_commits_('git-annex'))
        add_out = add_archive_content(
            opj('subdir', '1.tar'),
            delete_after=True,
            drop_after=True)
        assert_repo_status(self.annex.path)
        commits_after_master = list(self.annex.get_branch_commits_())
        commits_after = list(self.annex.get_branch_commits_('git-annex'))
        # There should be a single commit for all additions, +1 to
        # initiate the datalad-archives special remote (gh-1258).
        # If faking dates, there is another +1 (fake_dates_enabled is
        # a bool counted as 0 or 1) because annex.alwayscommit isn't
        # set to false.
        assert_equal(len(commits_after),
                     len(commits_prior) + 2 + self.annex.fake_dates_enabled)
        assert_equal(len(commits_after_master), len(commits_prior_master))
        assert add_out is self.annex
        # there should be no .datalad temporary files hanging around
        self.assert_no_trash_left_behind()

        # and if we add some untracked file, redo, there should be no changes
        # to master and the file should remain not committed
        create_tree(self.annex.path, {'dummy.txt': '123'})
        assert_true(self.annex.dirty)  # untracked file
        add_out = add_archive_content(
            opj('subdir', '1.tar'),
            delete_after=True,
            drop_after=True,
            allow_dirty=True)
        assert_repo_status(self.annex.path, untracked=['dummy.txt'])
        assert_equal(len(list(self.annex.get_branch_commits_())),
                     len(commits_prior_master))

        # there should be no .datalad temporary files hanging around
        self.assert_no_trash_left_behind()
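
# The tests above repeatedly call self.assert_no_trash_left_behind() without
# showing its definition.  Below is a minimal sketch of what such a fixture
# method might look like; the directory layout checked here is an assumption
# for illustration, not the actual datalad implementation.
def assert_no_trash_left_behind(self):
    from glob import glob
    import os.path as op
    # look for leftover temporary extraction directories in the working tree
    # (assumed to live under .git/datalad/tmp in this sketch)
    leftovers = glob(op.join(self.annex.path, '.git', 'datalad', 'tmp', '*'))
    assert not leftovers, "unexpected temporary files: %s" % leftovers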
def check_observe_tqdm(topdir, topurl, outdir):
    # just a helper to enable/use when one wants to quickly get some
    # repository with archives and observe tqdm
    from datalad.api import create, add_archive_content

    ds = create(outdir)
    for f in '1.tar.gz', '2.tar.gz':
        with chpwd(outdir):
            ds.repo.add_url_to_file(f, topurl + f)
            ds.add(f)
            add_archive_content(f, delete=True, drop_after=True)
    files = glob.glob(op.join(outdir, '*'))
    ds.drop(files)  # will not drop tarballs
    ds.repo.drop([], options=['--all', '--fast'])
    ds.get(files)
    ds.repo.drop([], options=['--all', '--fast'])
    # now loop so we could play with it outside
    print(outdir)
    # import pdb; pdb.set_trace()
    while True:
        sleep(0.1)
def test_add_archive_content_absolute_path(path=None):
    ds = Dataset(opj(path, "ds")).create(force=True)
    repo = ds.repo
    ds.save("1.tar.gz", message="1.tar.gz")
    abs_tar_gz = opj(path, "ds", "1.tar.gz")
    add_archive_content(abs_tar_gz, dataset=ds)
    ok_file_under_git(opj(path, "ds", "1", "foo"), annexed=True)

    commit_msg = repo.format_commit("%B")
    # The commit message uses relative paths.
    assert_not_in(abs_tar_gz, commit_msg)
    assert_in("1.tar.gz", commit_msg)

    res = add_archive_content(opj(path, "notds", "2.tar.gz"),
                              dataset=ds, on_failure='ignore')
    assert_in_results(
        res,
        action='add-archive-content',
        status='impossible',
        message='Can not add archive outside of the dataset',
    )
def download_file(bucket, d, dataset_dir):
    link = bucket["links"]["self"]
    annex = Repo(dataset_dir).git.annex
    if "access_token" not in link:
        if bucket["type"] == "zip":
            d.download_url(link, archive=True)
        else:
            try:
                # Try to addurl twice as rarely it might not work on the first try
                annex("addurl", link, "--fast", "--file", link.split("/")[-1])
            except GitCommandError:
                annex("addurl", link, "--fast", "--file", link.split("/")[-1])
    else:
        # Have to remove token from annex URL
        if bucket["type"] == "zip":
            file_path = d.download_url(link)[0]["path"]
            annex("rmurl", file_path, link)
            try:
                # Try to addurl twice as rarely it might not work on the first try
                annex("addurl", link.split("?")[0], "--file", file_path,
                      "--relaxed")
            except GitCommandError:
                annex("addurl", link.split("?")[0], "--file", file_path,
                      "--relaxed")
            api.add_archive_content(file_path,
                                    annex=AnnexRepo(dataset_dir),
                                    delete=True)
        else:
            # json.loads, not json.load: the annex call returns a string
            file_name = json.loads(
                annex("addurl", link, "--fast", "--json"))["file"]
            annex("rmurl", file_name, link)
            try:
                # Try to addurl twice as rarely it might not work on the first try
                annex("addurl", link.split("?")[0], "--file", file_name,
                      "--relaxed")
            except GitCommandError:
                annex("addurl", link.split("?")[0], "--file", file_name,
                      "--relaxed")
    d.save()
def __call__(urls, dataset=None, path=None, overwrite=False,
             archive=False, save=True, message=None):
    from ..downloaders.providers import Providers

    pwd, rel_pwd = get_dataset_pwds(dataset)

    ds = None
    if save or dataset:
        try:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='downloading urls')
        except NoDatasetArgumentFound:
            pass

    common_report = {"action": "download_url", "ds": ds}

    urls = assure_list_from_str(urls)

    if len(urls) > 1 and path and not op.isdir(path):
        yield get_status_dict(
            status="error",
            message=(
                "When specifying multiple urls, --path should point to "
                "an existing directory. Got %r", path),
            type="file",
            path=path,
            **common_report)
        return

    if dataset:
        # A dataset was explicitly given.
        path = op.normpath(op.join(ds.path, path or op.curdir))
    elif save and ds:
        path = op.normpath(op.join(ds.path, rel_pwd, path or op.curdir))
    elif not path:
        path = op.curdir

    # TODO setup fancy ui.progressbars doing this in parallel and reporting
    # overall progress in % of urls which were already downloaded
    providers = Providers.from_config_files()
    downloaded_paths = []
    path_urls = {}
    for url in urls:
        # somewhat "ugly"
        # providers.get_provider(url).get_downloader(url).download(url, path=path)
        # for now -- via sugaring
        try:
            downloaded_path = providers.download(url, path=path,
                                                 overwrite=overwrite)
        except Exception as e:
            yield get_status_dict(
                status="error",
                message=exc_str(e),
                type="file",
                path=path,
                **common_report)
        else:
            downloaded_paths.append(downloaded_path)
            path_urls[downloaded_path] = url
            yield get_status_dict(
                status="ok",
                type="file",
                path=downloaded_path,
                **common_report)

    if downloaded_paths and save and ds is not None:
        msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

        for r in ds.rev_save(downloaded_paths, message=msg):
            yield r

        if isinstance(ds.repo, AnnexRepo):
            annex_paths = [
                p for p, annexed in zip(
                    downloaded_paths,
                    ds.repo.is_under_annex(downloaded_paths))
                if annexed]
            if annex_paths:
                for path in annex_paths:
                    try:
                        # The file is already present. This is just to
                        # register the URL.
                        ds.repo.add_url_to_file(path, path_urls[path],
                                                batch=True)
                    except AnnexBatchCommandError as exc:
                        lgr.warning("Registering %s with %s failed: %s",
                                    path, path_urls[path], exc_str(exc))
                if archive:
                    from datalad.api import add_archive_content
                    for path in annex_paths:
                        add_archive_content(path, annex=ds.repo, delete=True)
def __call__(dataset, filename=None, missing_content='error', no_annex=False,
             # TODO: support working with projects and articles within them
             # project_id=None,
             article_id=None):
    import os
    import logging
    lgr = logging.getLogger('datalad.plugin.export_to_figshare')

    from datalad.ui import ui
    from datalad.api import add_archive_content
    from datalad.api import export_archive
    from datalad.distribution.dataset import require_dataset
    from datalad.support.annexrepo import AnnexRepo

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='export to figshare')

    if not isinstance(dataset.repo, AnnexRepo):
        raise ValueError(
            "%s is not an annex repo, so annexification cannot be done"
            % dataset
        )

    if dataset.repo.is_dirty():
        raise RuntimeError(
            "Paranoid authors of DataLad refuse to proceed in a dirty repository"
        )
    if filename is None:
        filename = dataset.path
    lgr.info(
        "Exporting current tree as an archive under %s since figshare "
        "does not support directories",
        filename
    )
    archive_out = next(
        export_archive(
            dataset,
            filename=filename,
            archivetype='zip',
            missing_content=missing_content,
            return_type="generator"
        )
    )
    assert archive_out['status'] == 'ok'
    fname = archive_out['path']

    lgr.info("Uploading %s to figshare", fname)
    figshare = FigshareRESTLaison()

    if not article_id:
        # TODO: ask if it should be an article within a project
        if ui.is_interactive:
            # or should we just upload to a new article?
            if ui.yesno(
                "Would you like to create a new article to upload to? "
                "If not - we will list existing articles",
                title="Article"
            ):
                article = figshare.create_article(
                    title=os.path.basename(dataset.path)
                )
                lgr.info(
                    "Created a new (private) article %(id)s at "
                    "%(url_private_html)s. Please visit it, enter "
                    "additional meta-data and make public",
                    article
                )
                article_id = article['id']
            else:
                article_id = int(ui.question(
                    "Which of the articles should we upload to.",
                    choices=list(map(str, figshare.get_article_ids()))
                ))
        if not article_id:
            raise ValueError("We need an article to upload to.")

    file_info = figshare.upload_file(
        fname,
        files_url='account/articles/%s/files' % article_id
    )

    if no_annex:
        lgr.info("Removing generated tarball")
        unlink(fname)
    else:
        # I will leave all the complaining etc to the dataset add if path
        # is outside etc
        lgr.info("'Registering' %s within annex", fname)
        repo = dataset.repo
        repo.add(fname, git=False)
        key = repo.get_file_key(fname)
        lgr.info("Adding URL %(download_url)s for it", file_info)
        repo._annex_custom_command(
            [],
            [
                "git", "annex", "registerurl",
                '-c', 'annex.alwayscommit=false',
                key, file_info['download_url']
            ]
        )

        lgr.info("Registering links back for the content of the archive")
        add_archive_content(
            fname,
            annex=dataset.repo,
            delete_after=True,  # just remove extracted into a temp dir
            allow_dirty=True,   # since we have a tarball
            commit=False        # we do not want to commit anything we have done here
        )

        lgr.info("Removing generated and now registered in annex archive")
        repo.drop(key, key=True, options=['--force'])
        repo.remove(fname, force=True)  # remove the tarball

        # if annex in {'delete'}:
        #     dataset.repo.remove(fname)
        # else:
        #     # kinda makes little sense I guess.
        #     # Made more sense if export_archive could export an arbitrary
        #     # treeish so we could create a branch where to dump and export
        #     # to figshare (kinda closer to my idea)
        #     dataset.save(fname,
        #                  message="Added the entire dataset into a zip file")

    # TODO: add to downloader knowledge about figshare token so it could
    # download-url those zipballs before they go public
    yield dict(
        status='ok',
        # TODO: add article url (which needs to be queried if only ID is known)
        message="Published archive {}".format(file_info['download_url']),
        file_info=file_info,
        path=dataset,
        action='export_to_figshare',
        logger=lgr
    )
def __call__(urls, dataset=None, path=None, overwrite=False,
             archive=False, save=True, message=None):
    from ..downloaders.providers import Providers

    ds = None
    if save or dataset:
        try:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='downloading urls')
        except NoDatasetFound:
            pass

    common_report = {"action": "download_url", "ds": ds}

    got_ds_instance = isinstance(dataset, Dataset)
    dir_is_target = not path or path.endswith(op.sep)
    path = str(resolve_path(path or op.curdir, ds=dataset))
    if dir_is_target:
        # resolve_path() doesn't preserve trailing separators. Add one for
        # the download() call.
        path = path + op.sep
    urls = ensure_list_from_str(urls)

    if not dir_is_target:
        if len(urls) > 1:
            yield get_status_dict(
                status="error",
                message=(
                    "When specifying multiple urls, --path should point to "
                    "a directory target (with a trailing separator). Got %r",
                    path),
                type="file",
                path=path,
                **common_report)
            return
        if archive:
            # make sure the file suffix indicated by a URL is preserved
            # so that any further archive processing doesn't have to
            # employ mime type inspection in order to determine the archive
            # type
            from datalad.support.network import URL
            suffixes = PurePosixPath(URL(urls[0]).path).suffixes
            if not Path(path).suffixes == suffixes:
                path += ''.join(suffixes)
        # we know that we have a single URL
        # download() would be fine getting an existing directory and
        # downloading the URL underneath it, but let's enforce a trailing
        # slash here for consistency.
        if op.isdir(path):
            yield get_status_dict(
                status="error",
                message=(
                    "Non-directory path given (no trailing separator) "
                    "but a directory with that name (after adding archive "
                    "suffix) exists"),
                type="file",
                path=path,
                **common_report)
            return

    # TODO setup fancy ui.progressbars doing this in parallel and reporting
    # overall progress in % of urls which were already downloaded
    providers = Providers.from_config_files()
    downloaded_paths = []
    path_urls = {}
    for url in urls:
        # somewhat "ugly"
        # providers.get_provider(url).get_downloader(url).download(url, path=path)
        # for now -- via sugaring
        try:
            downloaded_path = providers.download(url, path=path,
                                                 overwrite=overwrite)
        except Exception as e:
            yield get_status_dict(
                status="error",
                message=exc_str(e),
                type="file",
                path=path,
                **common_report)
        else:
            downloaded_paths.append(downloaded_path)
            path_urls[downloaded_path] = url
            yield get_status_dict(
                status="ok",
                type="file",
                path=downloaded_path,
                **common_report)

    if downloaded_paths and save and ds is not None:
        msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

        for r in Save()(
                downloaded_paths,
                message=msg,
                # ATTN: Pass the original dataset argument to
                # preserve relative path handling semantics.
                dataset=dataset,
                return_type="generator",
                result_xfm=None,
                result_filter=None,
                on_failure="ignore"):
            yield r

        if isinstance(ds.repo, AnnexRepo):
            if got_ds_instance:
                # Paths in `downloaded_paths` are already relative to the
                # dataset.
                rpaths = dict(zip(downloaded_paths, downloaded_paths))
            else:
                # Paths in `downloaded_paths` are relative to the current
                # working directory. Take these relative to the dataset
                # for use with the AnnexRepo method calls.
                rpaths = {}
                for orig_path, resolved in zip(
                        downloaded_paths,
                        resolve_path(downloaded_paths, ds=dataset)):
                    rpath = path_under_rev_dataset(ds, resolved)
                    if rpath:
                        rpaths[str(rpath)] = orig_path
                    else:
                        lgr.warning("Path %s not under dataset %s",
                                    orig_path, ds)
            annex_paths = [
                p for p, annexed in zip(
                    rpaths,
                    ds.repo.is_under_annex(list(rpaths.keys())))
                if annexed]
            if annex_paths:
                for path in annex_paths:
                    url = path_urls[rpaths[path]]
                    try:
                        # The file is already present. This is just to
                        # register the URL.
                        ds.repo.add_url_to_file(
                            path, url,
                            # avoid batch mode for single files
                            # https://github.com/datalad/datalad/issues/2849
                            batch=len(annex_paths) > 1,
                            # bypass URL size check, we already have the file
                            options=['--relaxed'])
                    except CommandError as exc:
                        lgr.warning("Registering %s with %s failed: %s",
                                    path, url, exc_str(exc))
                if archive:
                    from datalad.api import add_archive_content
                    for path in annex_paths:
                        add_archive_content(path, annex=ds.repo, delete=True)
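
# A small usage sketch of the command above, assuming it is exposed as
# datalad.api.download_url (as in datalad proper): with archive=True the
# downloaded file is saved, its URL is registered with git-annex, and its
# content is then extracted via add_archive_content(..., delete=True).
# The URL below is illustrative only.
from datalad.api import download_url

download_url(
    'https://example.com/data/1.tar.gz',  # illustrative URL
    dataset='.',     # operate on the dataset at the current directory
    archive=True,    # extract the downloaded tarball's content afterwards
    message="Download and extract 1.tar.gz",
)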
def test_add_archive_content(path_orig=None, url=None, repo_path=None):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API

        # no repo yet
        assert_raises(NoDatasetFound, add_archive_content,
                      "nonexisting.tar.gz")
        ds = Dataset(repo_path).create()
        res = ds.add_archive_content("nonexisting.tar.gz",
                                     on_failure='ignore')
        assert_in_results(res, action='add-archive-content',
                          status='impossible')
        repo = ds.repo
        # we can't add a file from outside the repo ATM
        res = ds.add_archive_content(Path(path_orig) / '1.tar.gz',
                                     on_failure='ignore')
        assert_in_results(res, action='add-archive-content',
                          status='impossible', type="dataset",
                          message="Can not add archive outside of the dataset")

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_url_to_file('1.tar.gz', opj(url, '1.tar.gz'))
            for s in range(1, 5):
                repo.add_url_to_file('%du/1.tar.gz' % s,
                                     opj(url, '%du/1.tar.gz' % s))
            repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_annexinfo('1.tar.gz')['key']
        # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        res = add_archive_content('1.tar.gz')
        assert_in_results(res, action='add-archive-content', status='ok')
        d1_basic_checks()

        # If ran again, should proceed just fine since the content is the
        # same so no changes would be made really
        res = add_archive_content('1.tar.gz')
        assert_in_results(res, action='add-archive-content', status='ok')

        # But that other one carries updated file, so should fail due to
        # overwrite
        res = add_archive_content(Path('1u') / '1.tar.gz',
                                  use_current_dir=True, on_failure='ignore')
        assert_in_results(
            res,
            action='add-archive-content',
            status='error',
        )
        assert_in('exists, but would be overwritten by new file',
                  res[0]['message'])

        # but should do fine if overrides are allowed
        add_archive_content(Path('1u') / '1.tar.gz', existing='overwrite',
                            use_current_dir=True)
        add_archive_content(Path('2u') / '1.tar.gz',
                            existing='archive-suffix', use_current_dir=True)
        add_archive_content(Path('3u') / '1.tar.gz',
                            existing='archive-suffix', use_current_dir=True)
        add_archive_content(Path('4u') / '1.tar.gz',
                            existing='archive-suffix', use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), dataset=ds.path,
                            use_current_dir=True)
        d1_basic_checks()
        # or we could keep relative path and also demand to keep the archive
        # prefix while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'), dataset=ds.path,
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        ds.add_archive_content(
            '1.tar.gz', exclude=['d'], rename=['/ /_', '/^1/2'],
            annex_options="-c annex.largefiles=exclude=*.txt",
            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add first archive to the repo so we could test
        # named the same way but different content
        with swallow_outputs():
            repo.add_url_to_file('d1/1.tar.gz', opj(url, 'd1', '1.tar.gz'))
            repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz', dataset=ds.path)
        d2_basic_checks()

    # in manual tests ran into the situation of inability to obtain on a
    # single run a file from an archive which was coming from a dropped key.
    # I thought it was tested in custom remote tests, but I guess not
    # sufficiently well enough
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=ds)
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher

    ds.drop(opj('1', '1 f.txt'))  # should be all kosher
    ds.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online
    # archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
    assert_equal(e.kwargs['stdout_json'][0]['success'], False)
    assert_result_values_cond(
        e.kwargs['stdout_json'], 'note',
        lambda x: '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))
def __call__(urls, dataset=None, path=None, overwrite=False,
             archive=False, save=True, message=None):
    from ..downloaders.providers import Providers

    pwd, rel_pwd = get_dataset_pwds(dataset)

    ds = None
    if save or dataset:
        try:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='downloading urls')
        except NoDatasetArgumentFound:
            pass

    common_report = {"action": "download_url", "ds": ds}

    urls = assure_list_from_str(urls)

    if len(urls) > 1 and path and not op.isdir(path):
        yield get_status_dict(
            status="error",
            message=(
                "When specifying multiple urls, --path should point to "
                "an existing directory. Got %r", path),
            type="file",
            path=path,
            **common_report)
        return

    if dataset:
        # A dataset was explicitly given.
        path = op.normpath(op.join(ds.path, path or op.curdir))
    elif save and ds:
        path = op.normpath(op.join(ds.path, rel_pwd, path or op.curdir))
    elif not path:
        path = op.curdir

    # TODO setup fancy ui.progressbars doing this in parallel and reporting
    # overall progress in % of urls which were already downloaded
    providers = Providers.from_config_files()
    downloaded_paths = []
    path_urls = {}
    for url in urls:
        # somewhat "ugly"
        # providers.get_provider(url).get_downloader(url).download(url, path=path)
        # for now -- via sugaring
        try:
            downloaded_path = providers.download(url, path=path,
                                                 overwrite=overwrite)
        except Exception as e:
            yield get_status_dict(
                status="error",
                message=exc_str(e),
                type="file",
                path=path,
                **common_report)
        else:
            downloaded_paths.append(downloaded_path)
            path_urls[downloaded_path] = url
            yield get_status_dict(
                status="ok",
                type="file",
                path=downloaded_path,
                **common_report)

    if downloaded_paths and save and ds is not None:
        msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

        for r in ds.add(downloaded_paths, message=msg):
            yield r

        if isinstance(ds.repo, AnnexRepo):
            annex_paths = [
                p for p, annexed in zip(
                    downloaded_paths,
                    ds.repo.is_under_annex(downloaded_paths))
                if annexed]
            if annex_paths:
                for path in annex_paths:
                    try:
                        # The file is already present. This is just to
                        # register the URL.
                        ds.repo.add_url_to_file(path, path_urls[path],
                                                batch=True)
                    except AnnexBatchCommandError as exc:
                        lgr.warning("Registering %s with %s failed: %s",
                                    path, path_urls[path], exc_str(exc))
                if archive:
                    from datalad.api import add_archive_content
                    for path in annex_paths:
                        add_archive_content(path, annex=ds.repo, delete=True)
def test_add_archive_content(path_orig, url, repo_path):
    with chpwd(repo_path):
        # TODO we need to be able to pass path into add_archive_content
        # We could mock but I mean for the API
        assert_raises(RuntimeError, add_archive_content,
                      "nonexisting.tar.gz")  # no repo yet

        repo = AnnexRepo(repo_path, create=True)
        assert_raises(ValueError, add_archive_content, "nonexisting.tar.gz")
        # we can't add a file from outside the repo ATM
        assert_raises(FileNotInRepositoryError, add_archive_content,
                      opj(path_orig, '1.tar.gz'))

        # Let's add first archive to the repo so we could test
        with swallow_outputs():
            repo.add_urls([opj(url, '1.tar.gz')],
                          options=["--pathdepth", "-1"])
            for s in range(1, 5):
                repo.add_urls([opj(url, '%du/1.tar.gz' % s)],
                              options=["--pathdepth", "-2"])
            repo.commit("added 1.tar.gz")

        key_1tar = repo.get_file_key('1.tar.gz')
        # will be used in the test later

        def d1_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '1 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd', '1d'), annexed=True)
            ok_archives_caches(repo_path, 0)

        # and by default it just does it, everything goes to annex
        repo_ = add_archive_content('1.tar.gz')
        eq_(repo.path, repo_.path)
        d1_basic_checks()

        # If ran again, should proceed just fine since the content is the
        # same so no changes would be made really
        add_archive_content('1.tar.gz')

        # But that other one carries updated file, so should fail due to
        # overwrite
        with assert_raises(RuntimeError) as cme:
            add_archive_content(opj('1u', '1.tar.gz'), use_current_dir=True)
        # TODO: somewhat not precise since we have two possible
        # "already exists" -- in caching and overwrite check
        assert_in("already exists", str(cme.exception))

        # but should do fine if overrides are allowed
        add_archive_content(opj('1u', '1.tar.gz'), existing='overwrite',
                            use_current_dir=True)
        add_archive_content(opj('2u', '1.tar.gz'),
                            existing='archive-suffix', use_current_dir=True)
        add_archive_content(opj('3u', '1.tar.gz'),
                            existing='archive-suffix', use_current_dir=True)
        add_archive_content(opj('4u', '1.tar.gz'),
                            existing='archive-suffix', use_current_dir=True)

        # rudimentary test
        assert_equal(sorted(map(basename, glob(opj(repo_path, '1', '1*')))),
                     ['1 f-1.1.txt', '1 f-1.2.txt', '1 f-1.txt', '1 f.txt'])
        whereis = repo.whereis(glob(opj(repo_path, '1', '1*')))
        # they all must be the same
        assert (all([x == whereis[0] for x in whereis[1:]]))

    # and we should be able to reference it while under subdirectory
    subdir = opj(repo_path, 'subdir')
    with chpwd(subdir, mkdir=True):
        add_archive_content(opj(pardir, '1.tar.gz'), use_current_dir=True)
        d1_basic_checks()

        # or we could keep relative path and also demand to keep the archive
        # prefix while extracting under original (annex root) dir
        add_archive_content(opj(pardir, '1.tar.gz'),
                            add_archive_leading_dir=True)

    with chpwd(opj(repo_path, '1')):
        d1_basic_checks()

    with chpwd(repo_path):
        # test with excludes and renames and annex options
        add_archive_content(
            '1.tar.gz', exclude=['d'], rename=['/ /_', '/^1/2'],
            annex_options="-c annex.largefiles=exclude=*.txt",
            delete=True)
        # no conflicts since new name
        ok_file_under_git('2', '1_f.txt', annexed=False)
        assert_false(exists(opj('2', 'd')))
        assert_false(exists('1.tar.gz'))  # delete was in effect

    # now test ability to extract within subdir
    with chpwd(opj(repo_path, 'd1'), mkdir=True):
        # Let's add first archive to the repo so we could test
        # named the same way but different content
        with swallow_outputs():
            repo.add_urls([opj(url, 'd1', '1.tar.gz')],
                          options=["--pathdepth", "-1"],
                          cwd=getpwd())  # invoke under current subdir
            repo.commit("added 1.tar.gz in d1")

        def d2_basic_checks():
            ok_(exists('1'))
            ok_file_under_git('1', '2 f.txt', annexed=True)
            ok_file_under_git(opj('1', 'd2', '2d'), annexed=True)
            ok_archives_caches(repo.path, 0)

        add_archive_content('1.tar.gz')
        d2_basic_checks()

    # in manual tests ran into the situation of inability to obtain on a
    # single run a file from an archive which was coming from a dropped key.
    # I thought it was tested in custom remote tests, but I guess not
    # sufficiently well enough
    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))
    ok_archives_caches(repo.path, 1, persistent=True)
    ok_archives_caches(repo.path, 0, persistent=False)

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher
    repo.get(opj('1', '1 f.txt'))  # that what managed to not work

    # TODO: check if persistent archive is there for the 1.tar.gz

    # We should be able to drop everything since available online
    with swallow_outputs():
        clean(dataset=repo.path)
    repo.drop(key_1tar, key=True)  # is available from the URL -- should be kosher

    repo.drop(opj('1', '1 f.txt'))  # should be all kosher
    repo.get(opj('1', '1 f.txt'))  # and should be able to get it again

    # bug was that dropping didn't work since archive was dropped first
    repo.call_annex(["drop", "--all"])

    # verify that we can't drop a file if archive key was dropped and online
    # archive was removed or changed size! ;)
    repo.get(key_1tar, key=True)
    unlink(opj(path_orig, '1.tar.gz'))
    with assert_raises(CommandError) as e:
        repo.drop(key_1tar, key=True)
    assert_equal(e.kwargs['stdout_json'][0]['success'], False)
    assert_result_values_cond(
        e.kwargs['stdout_json'], 'note',
        lambda x: '(Use --force to override this check, or adjust numcopies.)' in x)
    assert exists(opj(repo.path, repo.get_contentlocation(key_1tar)))