def fix_the_filename(data):
    """Yield ``data``, renaming the download when its extension is off.

    The extension of ``data['filename']`` (the downloaded file) is compared
    with the extension of ``data['target_filename']``.  When they agree the
    record is passed through untouched.  When they differ, a copy of the
    record is yielded whose ``filename`` is rebuilt from the target's base
    name with a ``_scene`` marker and the download's extension.

    Parameters
    ----------
    data : dict
      Record with at least the ``filename`` and ``target_filename`` keys.

    Yields
    ------
    dict
      Either ``data`` itself or an adjusted shallow copy of it.

    Raises
    ------
    AssertionError
      If the extensions differ in any way other than a ``zip`` download
      for a ``scene`` target.
    """
    from datalad.utils import file_basename

    *_, fetched_ext = file_basename(data['filename'], return_ext=True)
    # data['filename_orig']
    target_stem, target_ext = file_basename(
        data['target_filename'], return_ext=True)

    if target_ext == fetched_ext:
        # nothing to repair
        yield data
        return

    assert(fetched_ext == 'zip')  # we are not aware of other cases
    assert(target_ext == 'scene')
    # They have a bug that the same name of the archive provided for
    # multiple .scene files available within the study
    renamed = data.copy()
    renamed['filename'] = ''.join((target_stem, '_scene', '.', fetched_ext))
    yield renamed
def _datalad_export_plugin_call(dataset, output, argv=None):
    """Export the indexed files of ``dataset`` into a ``.tar.gz`` archive.

    Parameters
    ----------
    dataset
      Dataset to export; its ``repo``, ``id`` and ``path`` attributes are
      used.
    output : str or None
      Name of the tarball.  ``None`` selects ``datalad_<dataset.id>.tar.gz``;
      any other name gets ``.tar.gz`` appended unless it already ends with it.
    argv : optional
      Extra options; they are not supported and only trigger a warning.

    Returns
    -------
    str
      The final file name the archive was written to.
    """
    if argv:
        # Logger.warn() is a deprecated alias of Logger.warning()
        lgr.warning(
            "tarball exporter ignores any additional options '{}'".format(
                argv))

    repo = dataset.repo
    committed_date = repo.get_committed_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti

    if output is None:
        output = "datalad_{}.tar.gz".format(dataset.id)
    elif not output.endswith('.tar.gz'):
        output += '.tar.gz'

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(output)

    # workaround for inability to pass down the time stamp: time.time() is
    # patched for as long as the archive is open
    with patch('time.time', return_value=committed_date), \
            tarfile.open(output, "w:gz") as tar:
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(
                repo_files, allow_quick=True, batch=True)
        else:
            annexed = [False] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            # resolve to possible link target; an annexed file is not
            # necessarily a symlink, and os.readlink() would raise OSError
            # on a regular file
            if annexed[i] and os.path.islink(fpath):
                link_target = os.readlink(fpath)
                if not isabs(link_target):
                    link_target = normpath(opj(dirname(fpath), link_target))
                fpath = link_target
            # name in the tarball
            aname = normpath(opj(leading_dir, rpath))
            tar.add(
                fpath,
                arcname=aname,
                recursive=False,
                filter=_filter_tarinfo)

    # I think it might better return "final" filename where stuff was saved
    return output
def test_file_basename():
    # Table of (path, extra keyword arguments, expected result); the rows
    # are checked in order, so the first mismatch is the one reported.
    expectations = (
        ('1', {}, '1'),
        ('d1/1', {}, '1'),
        ('/d1/1', {}, '1'),
        ('1.', {}, '1.'),
        ('1.tar.gz', {}, '1'),
        ('1.Tar.gz', {}, '1'),
        ('1._bak.gz', {}, '1'),
        ('1.tar.gz', {'return_ext': True}, ('1', 'tar.gz')),
        ('/tmp/1.tar.gz', {}, '1'),
        ('/tmp/1.longish.gz', {}, '1.longish'),
        ('1_R1.1.1.tar.gz', {}, '1_R1.1.1'),
        ('ds202_R1.1.1.tgz', {}, 'ds202_R1.1.1'),
    )
    for name, options, expected in expectations:
        eq_(file_basename(name, **options), expected)
def __call__(dataset, filename=None, archivetype='tar', compression='gz',
             missing_content='error'):
    # Export the indexed files of a dataset into a TAR or ZIP archive.
    #
    # dataset          -- anything `require_dataset` accepts
    # filename         -- target file name; None selects
    #                     "datalad_<dataset id>" in the current directory, an
    #                     existing directory gets that default name placed
    #                     inside it; the archive extension is appended when
    #                     missing
    # archivetype      -- 'tar' selects tarfile, anything else zipfile
    # compression      -- tarfile mode suffix for TAR; for ZIP any non-empty
    #                     value selects ZIP_DEFLATED, empty ZIP_STORED
    # missing_content  -- 'continue' (warn) / 'ignore' (debug log) skip
    #                     annexed files without content, anything else raises
    #                     IOError
    #
    # Yields a single 'ok' result record carrying the absolute archive path.
    import logging
    import os
    import os.path as op
    import tarfile
    import zipfile
    from os.path import join as opj, dirname, normpath, isabs
    from unittest.mock import patch

    from datalad.distribution.dataset import require_dataset
    from datalad.support.annexrepo import AnnexRepo
    from datalad.utils import file_basename

    lgr = logging.getLogger('datalad.local.export_archive')

    dataset = require_dataset(
        dataset, check_installed=True, purpose='export archive')
    repo = dataset.repo
    committed_date = repo.get_commit_date()
    make_tar = archivetype == 'tar'

    # could be used later on to filter files by some criterion
    def _stamp_with_commit_date(tarinfo):
        # Git does not track filesystem times, so every member gets the date
        # of the last commit instead
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        tarinfo.mtime = committed_date
        return tarinfo

    # extra keyword arguments understood by TarFile.add only
    extra_args = (
        {'recursive': False, 'filter': _stamp_with_commit_date}
        if make_tar else {})

    # only TAR archives carry the compression in their extension
    if make_tar:
        compression_suffix = '{}{}'.format(
            '.' if compression else '', compression)
    else:
        compression_suffix = ''
    file_extension = '.{}{}'.format(archivetype, compression_suffix)

    default_filename = "datalad_{.id}".format(dataset)
    if filename is None:
        # in current directory
        filename = default_filename
    elif path.exists(filename) and path.isdir(filename):
        # under given directory
        filename = path.join(filename, default_filename)
    if not filename.endswith(file_extension):
        filename += file_extension

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(filename)

    # workaround for inability to pass down the time stamp: the archive is
    # opened and filled while time.time() is patched
    with patch('time.time', return_value=committed_date):
        if make_tar:
            opened = tarfile.open(filename, "w:{}".format(compression))
        else:
            zip_mode = \
                zipfile.ZIP_DEFLATED if compression else zipfile.ZIP_STORED
            opened = zipfile.ZipFile(filename, 'w', zip_mode)

        with opened as archive:
            add_method = archive.add if make_tar else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(
                    repo_files, allow_quick=True, batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(
                    repo_files, allow_quick=True, batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)

            for idx, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[idx]:
                    if not has_content[idx]:
                        if missing_content not in ('ignore', 'continue'):
                            raise IOError(
                                'File %s has no content available' % fpath)
                        report = lgr.warning \
                            if missing_content == 'continue' else lgr.debug
                        report(
                            'File %s has no content available, skipped',
                            fpath)
                        continue
                    # resolve to possible link target
                    if op.islink(fpath):
                        target = os.readlink(fpath)
                        fpath = target if isabs(target) \
                            else normpath(opj(dirname(fpath), target))
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(fpath, arcname=aname, **extra_args)

    if not isabs(filename):
        filename = opj(os.getcwd(), filename)

    yield {
        'status': 'ok',
        'path': filename,
        'type': 'file',
        'action': 'export_archive',
        'logger': lgr,
    }
def __call__(dataset, filename=None, archivetype='tar', compression='gz',
             missing_content='error'):
    # Export the indexed files of a dataset into a TAR or ZIP archive.
    #
    # dataset          -- anything `require_dataset` accepts
    # filename         -- target file name; None selects
    #                     "datalad_<dataset id>" in the current directory, an
    #                     existing directory gets that default name placed
    #                     inside it; the archive extension is appended when
    #                     missing
    # archivetype      -- 'tar' selects tarfile, anything else zipfile
    # compression      -- tarfile mode suffix for TAR; for ZIP any non-empty
    #                     value selects ZIP_DEFLATED, empty ZIP_STORED
    # missing_content  -- 'continue' (warn) / 'ignore' (debug log) skip
    #                     annexed files without content, anything else raises
    #                     IOError
    #
    # Yields a single 'ok' result record carrying the absolute archive path.
    import os
    import tarfile
    import zipfile
    # stdlib mock instead of the third-party `mock` backport
    from unittest.mock import patch
    from os.path import join as opj, dirname, normpath, isabs
    import os.path as op

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo

    import logging
    lgr = logging.getLogger('datalad.plugin.export_archive')

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='export archive')

    repo = dataset.repo
    committed_date = repo.get_commit_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti
    tar_args = dict(recursive=False, filter=_filter_tarinfo)

    # only TAR archives carry the compression in their extension
    file_extension = '.{}{}'.format(
        archivetype,
        '{}{}'.format(
            '.' if compression else '',
            compression) if archivetype == 'tar' else '')

    default_filename = "datalad_{.id}".format(dataset)
    if filename is None:
        filename = default_filename  # in current directory
    elif path.exists(filename) and path.isdir(filename):
        # under given directory
        filename = path.join(filename, default_filename)
    if not filename.endswith(file_extension):
        filename += file_extension

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(filename)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(filename, "w:{}".format(compression)) \
            if archivetype == 'tar' \
            else zipfile.ZipFile(
                filename, 'w',
                zipfile.ZIP_STORED if not compression
                else zipfile.ZIP_DEFLATED) \
            as archive:
        add_method = archive.add if archivetype == 'tar' else archive.write
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(
                repo_files, allow_quick=True, batch=True)
            # remember: returns False for files in Git!
            has_content = repo.file_has_content(
                repo_files, allow_quick=True, batch=True)
        else:
            annexed = [False] * len(repo_files)
            has_content = [True] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            if annexed[i]:
                if not has_content[i]:
                    if missing_content in ('ignore', 'continue'):
                        (lgr.warning if missing_content == 'continue'
                         else lgr.debug)(
                            'File %s has no content available, skipped',
                            fpath)
                        continue
                    else:
                        raise IOError(
                            'File %s has no content available' % fpath)

                # resolve to possible link target
                if op.islink(fpath):
                    link_target = os.readlink(fpath)
                    if not isabs(link_target):
                        link_target = normpath(
                            opj(dirname(fpath), link_target))
                    fpath = link_target
            # name in the archive
            aname = normpath(opj(leading_dir, rpath))
            add_method(
                fpath,
                arcname=aname,
                **(tar_args if archivetype == 'tar' else {}))

    if not isabs(filename):
        filename = opj(os.getcwd(), filename)

    yield dict(
        status='ok',
        path=filename,
        type='file',
        action='export_archive',
        logger=lgr)
def __call__(
        archive,
        *,
        dataset=None,
        annex=None,
        add_archive_leading_dir=False,
        strip_leading_dirs=False,
        leading_dirs_depth=None,
        leading_dirs_consider=None,
        use_current_dir=False,
        delete=False,
        key=False,
        exclude=None,
        rename=None,
        existing='fail',
        annex_options=None,
        copy=False,
        commit=True,
        allow_dirty=False,
        stats=None,
        drop_after=False,
        delete_after=False):
    # Add the files contained in an annexed archive to the dataset's annex,
    # registering for each file a URL obtained from the archives custom
    # remote.
    #
    # This is a generator: it yields result records ('impossible' / 'error'
    # when a precondition fails, one 'ok' record on success) and finally
    # returns the annex repository as the generator's return value.
    #
    # archive                  -- path of the archive, or the annex key
    #                             itself when `key` is true
    # dataset                  -- passed to `require_dataset`/`resolve_path`
    # annex                    -- deprecated; only triggers a
    #                             DeprecationWarning, the dataset's repo is
    #                             always used
    # add_archive_leading_dir  -- place content under a directory named
    #                             after the archive's base name
    # strip_leading_dirs, leading_dirs_depth, leading_dirs_consider
    #                          -- control stripping of the leading directory
    #                             reported by the extracted archive
    # use_current_dir          -- extract relative to the current directory
    #                             rather than the archive's directory (forced
    #                             on when `key` is true)
    # delete                   -- remove the original archive afterwards
    # key                      -- treat `archive` as an annex key
    # exclude                  -- regular expressions; matching extracted
    #                             files are skipped
    # rename                   -- replacement rules applied to target names
    # existing                 -- 'fail', 'overwrite', 'archive-suffix' or
    #                             'numeric-suffix'; anything else raises
    #                             ValueError once a collision is handled
    # annex_options            -- options for adding URLs; a string is split
    #                             like a command line
    # copy                     -- not implemented, raises NotImplementedError
    # commit                   -- commit the result if anything changed
    # allow_dirty              -- skip the "clean dataset required" check
    # stats                    -- external stats object the local counters
    #                             are added to
    # drop_after               -- drop each key right after adding it
    # delete_after             -- add files under a temporary directory and
    #                             remove them again at the end

    if exclude:
        exclude = ensure_tuple_or_list(exclude)
    if rename:
        rename = ensure_tuple_or_list(rename)
    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='add-archive-content')

    # set up common params for result records
    res_kwargs = {
        'action': 'add-archive-content',
        'logger': lgr,
    }

    if not isinstance(ds.repo, AnnexRepo):
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message="Can't operate in a pure Git repository",
            **res_kwargs
        )
        return
    if annex:
        warnings.warn(
            "datalad add_archive_content's `annex` parameter is "
            "deprecated and will be removed in a future release. "
            "Use the 'dataset' parameter instead.",
            DeprecationWarning)
    # whatever was passed as `annex` is discarded from here on
    annex = ds.repo
    # get the archive path relative from the ds root
    archive_path = resolve_path(archive, ds=dataset)
    # let Status decide whether we can act on the given file
    for s in ds.status(
            path=archive_path,
            on_failure='ignore',
            result_renderer='disabled'):
        if s['status'] == 'error':
            if 'path not underneath the reference dataset %s' in s['message']:
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    message='Can not add archive outside of the dataset',
                    **res_kwargs)
                return
            # status errored & we haven't anticipated the cause. Bubble up
            yield s
            return
        elif s['state'] == 'untracked':
            # we can't act on an untracked file
            message = (
                "Can not add an untracked archive. "
                "Run 'datalad save {}'".format(archive)
            )
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=message,
                **res_kwargs)
            return

    if not allow_dirty and annex.dirty:
        # error out here if the dataset contains untracked changes
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message=(
                'clean dataset required. '
                'Use `datalad status` to inspect unsaved changes'),
            **res_kwargs
        )
        return

    # ensure the archive exists, status doesn't error on a non-existing file
    if not key and not lexists(archive_path):
        yield get_status_dict(
            ds=ds,
            status='impossible',
            # NOTE(review): the message is a one-element tuple, not a string
            message=(
                'No such file: {}'.format(archive_path),
            ),
            **res_kwargs
        )
        return

    if not key:
        check_path = archive_path.relative_to(ds.pathobj)
        # TODO: support adding archives content from outside the annex/repo
        origin = 'archive'
        # can become get_file_annexinfo once #6104 is merged
        key = annex.get_file_annexinfo(check_path)['key']
        if not key:
            raise RuntimeError(
                f"Archive must be an annexed file in {ds}")
        archive_dir = Path(archive_path).parent
    else:
        origin = 'key'
        key = archive
        # We must not have anything to do with the location under .git/annex
        archive_dir = None
        # instead, we will go from the current directory
        use_current_dir = True

    archive_basename = file_basename(archive)

    # NOTE(review): a falsy key already raised RuntimeError in the branch
    # above, so this is only reachable when `key` was requested and `archive`
    # itself is falsy
    if not key:
        # if we didn't manage to get a key, the file must be in Git
        raise NotImplementedError(
            "Provided file %s does not seem to be under annex control. "
            "We don't support adding everything straight to Git" % archive
        )

    # figure out our location
    pwd = getpwd()
    # are we in a subdirectory of the repository?
    # NOTE(review): archive_dir is a Path or None here; if annex.path is a
    # plain string this comparison can never be true -- confirm
    pwd_in_root = annex.path == archive_dir
    # then we should add content under that subdirectory,
    # get the path relative to the repo top
    if use_current_dir:
        # extract the archive under the current directory, not the directory
        # where the archive is located
        extract_rpath = Path(pwd).relative_to(ds.path) \
            if not pwd_in_root \
            else None
    else:
        extract_rpath = archive_dir.relative_to(ds.path)

    # relpath might return '.' as the relative path to curdir, which then normalize_paths
    # would take as instructions to really go from cwd, so we need to sanitize
    if extract_rpath == curdir:
        extract_rpath = None

    try:
        key_rpath = annex.get_contentlocation(key)
    # NOTE(review): a bare except also intercepts KeyboardInterrupt and
    # SystemExit and turns them into RuntimeError
    except:
        # the only probable reason for this to fail is that there is no
        # content present
        raise RuntimeError(
            "Content of %s seems to be N/A. Fetch it first" % key
        )

    # now we simply need to go through every file in that archive and
    lgr.info(
        "Adding content of the archive %s into annex %s", archive, annex
    )

    from datalad.customremotes.archives import ArchiveAnnexCustomRemote

    # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
    # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
    # OK, let's ignore that the following class is actually a special
    # remote implementation, and use it only to work with its cache
    annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                            path=annex.path,
                                            persistent_cache=True)
    # We will move extracted content so it must not exist prior running
    annexarchive.cache.allow_existing = True
    earchive = annexarchive.cache[key_rpath]
    # make sure there is an enabled datalad-archives special remote
    ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                          autoenable=True)

    precommitted = False
    # remembered so the `finally` clause can restore it
    old_always_commit = annex.always_commit
    # batch mode is disabled when faking dates, we want to always commit
    annex.always_commit = annex.fake_dates_enabled
    if annex_options:
        if isinstance(annex_options, str):
            annex_options = split_cmdline(annex_options)
    delete_after_rpath = None

    # with delete_after, a temporary directory is created inside the
    # repository; only its base name is kept
    prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                           dir=annex.path)) \
        if delete_after \
        else None

    # dedicated stats which would be added to passed in (if any)
    outside_stats = stats
    stats = ActivityStats()

    try:
        # keep track of extracted files for progress bar logging
        file_counter = 0
        # iterative over all files in the archive
        extracted_files = list(earchive.get_extracted_files())
        # start a progress bar for extraction
        pbar_id = f'add-archive-{archive_path}'
        log_progress(
            lgr.info, pbar_id, 'Extracting archive',
            label="Extracting archive",
            unit=' Files',
            total = len(extracted_files),
            noninteractive_level = logging.INFO)
        for extracted_file in extracted_files:
            file_counter += 1
            files_left = len(extracted_files) - file_counter
            log_progress(
                lgr.info, pbar_id,
                "Files to extract %i ", files_left,
                update=1,
                increment=True,
                noninteractive_level=logging.DEBUG)
            stats.files += 1
            extracted_path = Path(earchive.path) / Path(extracted_file)

            # broken symlinks inside the archive are counted as skipped
            if extracted_path.is_symlink():
                link_path = str(extracted_path.resolve())
                if not exists(link_path):
                    # TODO: config addarchive.symlink-broken='skip'
                    lgr.warning(
                        "Path %s points to non-existing file %s" %
                        (extracted_path, link_path)
                    )
                    stats.skipped += 1
                    continue
                    # TODO: check if points outside of archive - warn & skip

            url = annexarchive.get_file_url(
                archive_key=key,
                file=extracted_file,
                size=os.stat(extracted_path).st_size)

            # preliminary target name which might get modified by renames
            target_file_orig = target_file = Path(extracted_file)

            # stream archives would not have had the original filename
            # information in them, so would be extracted under a name
            # derived from their annex key.
            # Provide ad-hoc handling for such cases
            if (len(extracted_files) == 1 and
                    Path(archive).suffix in ('.xz', '.gz', '.lzma') and
                    Path(key_rpath).name.startswith(Path(
                        extracted_file).name)):
                # take archive's name without extension for filename & place
                # where it was originally extracted
                target_file = \
                    Path(extracted_file).parent / Path(archive).stem

            if strip_leading_dirs:
                leading_dir = earchive.get_leading_directory(
                    depth=leading_dirs_depth, exclude=exclude,
                    consider=leading_dirs_consider)
                # also drop the path separator following the leading dir
                leading_dir_len = \
                    len(leading_dir) + len(opsep) if leading_dir else 0
                # from here on target_file is a str, not a Path
                target_file = str(target_file)[leading_dir_len:]

            if add_archive_leading_dir:
                # place extracted content under a directory corresponding to
                # the archive name with suffix stripped.
                target_file = Path(archive_basename) / target_file

            if rename:
                target_file = apply_replacement_rules(rename,
                                                      str(target_file))

            # continue to next iteration if extracted_file in excluded
            if exclude:
                try:  # since we need to skip outside loop from inside loop
                    for regexp in exclude:
                        if re.search(regexp, extracted_file):
                            lgr.debug(
                                "Skipping {extracted_file} since contains "
                                "{regexp} pattern".format(**locals()))
                            stats.skipped += 1
                            # caught right below, never leaves the generator
                            raise StopIteration
                except StopIteration:
                    continue

            if delete_after:
                # place target file in a temporary directory
                target_file = Path(prefix_dir) / Path(target_file)
                # but also allow for it in the orig
                target_file_orig = Path(prefix_dir) / Path(target_file_orig)

            target_file_path_orig = annex.pathobj / target_file_orig

            # If we were invoked in a subdirectory, patch together the
            # correct path
            target_file_path = extract_rpath / target_file \
                if extract_rpath else target_file
            target_file_path = annex.pathobj / target_file_path

            # when the file already exists...
            if lexists(target_file_path):
                handle_existing = True
                if md5sum(str(target_file_path)) == \
                        md5sum(str(extracted_path)):
                    if not annex.is_under_annex(str(extracted_path)):
                        # if under annex -- must be having the same content,
                        # we should just add possibly a new extra URL
                        # but if under git -- we cannot/should not do
                        # anything about it ATM
                        if existing != 'overwrite':
                            continue
                    else:
                        handle_existing = False
                if not handle_existing:
                    pass  # nothing... just to avoid additional indentation
                elif existing == 'fail':
                    message = \
                        "{} exists, but would be overwritten by new file " \
                        "{}. Consider adjusting --existing".format\
                        (target_file_path, extracted_file)
                    yield get_status_dict(
                        ds=ds,
                        status='error',
                        message=message,
                        **res_kwargs)
                    return
                elif existing == 'overwrite':
                    stats.overwritten += 1
                    # to make sure it doesn't conflict -- might have been a
                    # tree
                    rmtree(target_file_path)
                else:
                    # an elaborate dance to piece together new archive names
                    target_file_path_orig_ = target_file_path

                    # To keep extension intact -- operate on the base of the
                    # filename
                    p, fn = os.path.split(target_file_path)
                    ends_with_dot = fn.endswith('.')
                    fn_base, fn_ext = file_basename(fn, return_ext=True)

                    if existing == 'archive-suffix':
                        fn_base += '-%s' % archive_basename
                    elif existing == 'numeric-suffix':
                        pass  # archive-suffix will have the same logic
                    else:
                        # we shouldn't get here, argparse should catch a
                        # non-existing value for --existing right away
                        raise ValueError(existing)
                    # keep incrementing index in the suffix until file
                    # doesn't collide
                    suf, i = '', 0
                    while True:
                        connector = \
                            ('.' if (fn_ext or ends_with_dot) else '')
                        file = fn_base + suf + connector + fn_ext
                        target_file_path_new = \
                            Path(p) / Path(file)
                        if not lexists(target_file_path_new):
                            # we found a file name that is not yet taken
                            break
                        lgr.debug("Iteration %i of file name finding. "
                                  "File %s already exists", i,
                                  target_file_path_new)
                        i += 1
                        suf = '.%d' % i
                    target_file_path = target_file_path_new
                    lgr.debug("Original file %s will be saved into %s"
                              % (target_file_path_orig_, target_file_path))
                    # TODO: should we reserve smth like
                    # stats.clobbed += 1

            if target_file_path != target_file_path_orig:
                stats.renamed += 1

            if copy:
                raise NotImplementedError(
                    "Not yet copying from 'persistent' cache"
                )

            lgr.debug("Adding %s to annex pointing to %s and with options "
                      "%r", target_file_path, url, annex_options)

            out_json = annex.add_url_to_file(
                target_file_path,
                url, options=annex_options,
                batch=True)

            # a key in the reply means the file went to the annex
            if 'key' in out_json and out_json['key'] is not None:
                # annex.is_under_annex(target_file, batch=True):
                # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                # we need to maintain a list of those to be dropped files
                if drop_after:
                    # drop extracted files after adding to annex
                    annex.drop_key(out_json['key'], batch=True)
                    stats.dropped += 1
                stats.add_annex += 1
            else:
                lgr.debug("File {} was added to git, not adding url".format(
                    target_file_path))
                stats.add_git += 1

            if delete_after:
                # we count the removal here, but don't yet perform it
                # to not interfere with batched processes - any pure Git
                # action invokes precommit which closes batched processes.
                stats.removed += 1

            # Done with target_file -- just to have clear end of the loop
            del target_file

        if delete and archive and origin != 'key':
            lgr.debug("Removing the original archive {}".format(archive))
            # force=True since some times might still be staged and fail
            annex.remove(str(archive_path), force=True)

        lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

        if outside_stats:
            outside_stats += stats
        if delete_after:
            # force since not committed. r=True for -r (passed into git call
            # to recurse)
            delete_after_rpath = opj(extract_rpath, prefix_dir) \
                if extract_rpath else prefix_dir
            delete_after_rpath = resolve_path(delete_after_rpath,
                                              ds=dataset)
            lgr.debug(
                "Removing extracted and annexed files under %s",
                delete_after_rpath
            )
            annex.remove(str(delete_after_rpath), r=True, force=True)
        if commit:
            archive_rpath = archive_path.relative_to(ds.path)
            commit_stats = outside_stats if outside_stats else stats
            # so batched ones close and files become annex symlinks etc
            annex.precommit()
            precommitted = True
            if any(r.get('state', None) != 'clean'
                   for p, r in annex.status(untracked='no').items()):
                annex.commit(
                    "Added content extracted from %s %s\n\n%s" %
                    (origin, archive_rpath,
                     commit_stats.as_str(mode='full')),
                    _datalad_msg=True
                )
                commit_stats.reset()
        else:
            # don't commit upon completion
            pass
    finally:
        # take down the progress bar
        # NOTE(review): pbar_id is bound inside the try block; if
        # get_extracted_files() raises before that, this reference fails with
        # UnboundLocalError and hides the original error
        log_progress(
            lgr.info, pbar_id,
            'Finished extraction',
            noninteractive_level=logging.INFO)
        # since we batched addurl, we should close those batched processes
        # if haven't done yet. explicitly checked to avoid any possible
        # "double-action"
        if not precommitted:
            annex.precommit()

        if delete_after_rpath:
            delete_after_path = opj(annex.path, delete_after_rpath)
            delete_after_rpath = resolve_path(delete_after_rpath,
                                              ds=dataset)
            if exists(delete_after_path):  # should not be there
                # but for paranoid yoh
                lgr.warning(
                    "Removing temporary directory under which extracted "
                    "files were annexed and should have been removed: %s",
                    delete_after_path)
                rmtree(delete_after_path)

        annex.always_commit = old_always_commit
        # remove what is left and/or everything upon failure
        earchive.clean(force=True)
        # remove tempfile directories (not cleaned up automatically):
        # NOTE(review): prefix_dir is only a base name, so lexists() resolves
        # it against the current directory, not annex.path -- confirm intent
        if prefix_dir is not None and lexists(prefix_dir):
            os.rmdir(prefix_dir)
    yield get_status_dict(
        ds=ds,
        status='ok',
        **res_kwargs)
    # becomes the generator's return value (StopIteration.value)
    return annex
def dlplugin(dataset, output=None):
    """Export the indexed files of a dataset as a ``.tar.gz`` archive.

    Parameters
    ----------
    output : str, optional
      File name of the generated tarball.  If no name is given the archive
      is named ``datalad_<dataset id>.tar.gz``.  ``.tar.gz`` is appended to
      any given name that does not already end with it.

    Yields
    ------
    dict
      A single result record with ``status='ok'`` and the absolute path of
      the generated archive.
    """
    import os
    import tarfile
    # stdlib mock instead of the third-party `mock` backport
    from unittest.mock import patch
    from os.path import join as opj, dirname, normpath, isabs, islink
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo

    import logging
    lgr = logging.getLogger('datalad.plugin.tarball')

    repo = dataset.repo
    committed_date = repo.get_committed_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti

    if output is None:
        output = "datalad_{}.tar.gz".format(dataset.id)
    elif not output.endswith('.tar.gz'):
        output += '.tar.gz'

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(output)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(output, "w:gz") as tar:
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(
                repo_files, allow_quick=True, batch=True)
        else:
            annexed = [False] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            # resolve to possible link target; an annexed file is not
            # necessarily a symlink, and os.readlink() would raise OSError
            # on a regular file
            if annexed[i] and islink(fpath):
                link_target = os.readlink(fpath)
                if not isabs(link_target):
                    link_target = normpath(opj(dirname(fpath), link_target))
                fpath = link_target
            # name in the tarball
            aname = normpath(opj(leading_dir, rpath))
            tar.add(
                fpath,
                arcname=aname,
                recursive=False,
                filter=_filter_tarinfo)

    if not isabs(output):
        output = opj(os.getcwd(), output)

    yield dict(
        status='ok',
        path=output,
        type='file',
        action='export_tarball',
        logger=lgr)
def dlplugin(dataset, filename=None, archivetype='tar', compression='gz',
             missing_content='error'):
    """Export the content of a dataset as a TAR/ZIP archive.

    Parameters
    ----------
    filename : str, optional
      File name of the generated TAR archive. If no file name is given
      the archive will be generated in the current directory and will
      be named: datalad_<dataset_uuid>.(tar.*|zip).
    archivetype : {'tar', 'zip'}
      Type of archive to generate.
    compression : {'', 'gz', 'bz2'}
      Compression method to use. 'bz2' is not supported for ZIP archives.
    missing_content : {'error', 'continue', 'ignore'}, optional
      By default, any discovered file with missing content will result in
      an error and the plugin is aborted. Setting this to 'continue' will
      issue warnings instead of failing on error. The value 'ignore' will
      only inform about problem at the 'debug' log level. The latter two
      can be helpful when generating a TAR archive from a dataset where
      some file content is not available locally.

    Yields
    ------
    dict
      A single result record with ``status='ok'`` and the absolute path of
      the generated archive.

    Raises
    ------
    IOError
      If an annexed file has no content and `missing_content` is neither
      'continue' nor 'ignore'.
    """
    import os
    import tarfile
    import zipfile
    # stdlib mock instead of the third-party `mock` backport
    from unittest.mock import patch
    from os.path import join as opj, dirname, normpath, isabs, islink
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo

    import logging
    lgr = logging.getLogger('datalad.plugin.export_archive')

    repo = dataset.repo
    committed_date = repo.get_committed_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti
    tar_args = dict(recursive=False, filter=_filter_tarinfo)

    # only TAR archives carry the compression in their extension
    file_extension = '.{}{}'.format(
        archivetype,
        '{}{}'.format(
            '.' if compression else '',
            compression) if archivetype == 'tar' else '')

    if filename is None:
        filename = "datalad_{}".format(dataset.id)
    if not filename.endswith(file_extension):
        filename += file_extension

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(filename)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(filename, "w:{}".format(compression)) \
            if archivetype == 'tar' \
            else zipfile.ZipFile(
                filename, 'w',
                zipfile.ZIP_STORED if not compression
                else zipfile.ZIP_DEFLATED) \
            as archive:
        add_method = archive.add if archivetype == 'tar' else archive.write
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(
                repo_files, allow_quick=True, batch=True)
            # remember: returns False for files in Git!
            has_content = repo.file_has_content(
                repo_files, allow_quick=True, batch=True)
        else:
            annexed = [False] * len(repo_files)
            has_content = [True] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            if annexed[i]:
                if not has_content[i]:
                    if missing_content in ('ignore', 'continue'):
                        (lgr.warning if missing_content == 'continue'
                         else lgr.debug)(
                            'File %s has no content available, skipped',
                            fpath)
                        continue
                    else:
                        # interpolate the path; passing it as a second
                        # argument would leave '%s' in the message
                        raise IOError(
                            'File %s has no content available' % fpath)

                # resolve to possible link target; an annexed file is not
                # necessarily a symlink, and os.readlink() would raise
                # OSError on a regular file
                if islink(fpath):
                    link_target = os.readlink(fpath)
                    if not isabs(link_target):
                        link_target = normpath(
                            opj(dirname(fpath), link_target))
                    fpath = link_target
            # name in the archive
            aname = normpath(opj(leading_dir, rpath))
            add_method(
                fpath,
                arcname=aname,
                **(tar_args if archivetype == 'tar' else {}))

    if not isabs(filename):
        filename = opj(os.getcwd(), filename)

    yield dict(
        status='ok',
        path=filename,
        type='file',
        action='export_archive',
        logger=lgr)
def __call__(filename=None, *, dataset=None, archivetype='tar',
             compression='gz', missing_content='error'):
    # Export the content of a dataset at HEAD into a TAR or ZIP archive.
    #
    # filename         -- target file name; None selects
    #                     "datalad_<dataset id>" in the current directory, an
    #                     existing directory gets that default name placed
    #                     inside it; the archive extension is enforced
    # dataset          -- anything `require_dataset` accepts
    # archivetype      -- 'tar' selects tarfile, anything else zipfile
    # compression      -- tarfile mode suffix for TAR; for ZIP any non-empty
    #                     value selects ZIP_DEFLATED, empty ZIP_STORED
    # missing_content  -- 'continue' (warn) / 'ignore' (debug log) skip
    #                     annexed files without content, anything else raises
    #                     IOError
    #
    # Yields a single 'ok' result record carrying the resolved archive path.
    import tarfile
    import zipfile
    from unittest.mock import patch

    from datalad.distribution.dataset import require_dataset
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo

    import logging
    lgr = logging.getLogger('datalad.local.export_archive')

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='export archive')

    repo = dataset.repo
    committed_date = repo.get_commit_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti
    tar_args = dict(recursive=False, filter=_filter_tarinfo)

    # only TAR archives carry the compression in their extension
    file_extension = '.{}{}'.format(
        archivetype,
        '{}{}'.format(
            '.' if compression else '',
            compression) if archivetype == 'tar' else '')

    default_filename = "datalad_{.id}".format(dataset)
    if filename is not None:
        filename = Path(filename)
    if filename is None:
        filename = Path(default_filename)  # in current directory
    elif filename.exists() and filename.is_dir():
        filename = filename / default_filename  # under given directory
    # Compare against the end of the whole name: Path.suffix is only the
    # last component ('.gz'), so it never equals a multi-part extension such
    # as '.tar.gz' and 'x.tar.gz' would otherwise become 'x.tar.tar.gz'
    if not filename.name.endswith(file_extension):
        filename = filename.with_suffix(file_extension)

    # use dir inside matching the output filename without suffix(es)
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(filename)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(filename, "w:{}".format(compression)) \
            if archivetype == 'tar' \
            else zipfile.ZipFile(
                filename, 'w',
                zipfile.ZIP_STORED if not compression
                else zipfile.ZIP_DEFLATED) \
            as archive:
        add_method = archive.add if archivetype == 'tar' else archive.write

        repo_files = repo.get_content_info(ref='HEAD', untracked='no')
        if isinstance(repo, AnnexRepo):
            # add availability (has_content) info
            repo_files = repo.get_content_annexinfo(
                ref='HEAD',
                init=repo_files,
                eval_availability=True)
        for p, props in repo_files.items():
            # a 'key' property marks an annexed file
            if 'key' in props and not props.get('has_content', False):
                if missing_content in ('ignore', 'continue'):
                    (lgr.warning if missing_content == 'continue'
                     else lgr.debug)(
                        'File %s has no content available, skipped', p)
                    continue
                else:
                    raise IOError('File %s has no content available' % p)
            # name in the archive
            aname = Path(leading_dir) / p.relative_to(repo.pathobj)
            # annexed files are read from their object location
            add_method(
                p if 'key' not in props else props['objloc'],
                arcname=aname,
                **(tar_args if archivetype == 'tar' else {}))

    yield dict(
        status='ok',
        path=filename.resolve(),
        type='file',
        action='export_archive',
        logger=lgr)