Example #1
def fix_the_filename(data):
    from datalad.utils import file_basename
    download_ext = file_basename(data['filename'], return_ext=True)[-1]
    orig_filename, orig_ext = file_basename(data['target_filename'], return_ext=True)  # data['filename_orig']
    if orig_ext != download_ext:
        assert(download_ext == 'zip')  # we are not aware of other cases
        assert(orig_ext == 'scene')
        data = data.copy()
        # Work around an upstream bug: the same archive name is provided for
        # multiple .scene files available within the study
        data['filename'] = orig_filename + '_scene' + '.' + download_ext
    yield data
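Since fix_the_filename is a generator stage, it is exercised by feeding it a record and consuming what it yields. A minimal driver sketch, with a made-up record whose keys mirror the code above (the real pipeline that uses this stage may carry additional fields):

# hypothetical input record; key names mirror the function above
record = {'filename': 'study_archive.zip', 'target_filename': 'subject01.scene'}
fixed = list(fix_the_filename(record))[0]
print(fixed['filename'])  # -> 'subject01_scene.zip'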
Example #2
def _datalad_export_plugin_call(dataset, output, argv=None):
    # these imports are module-level in the original plugin module
    import os
    import tarfile
    from unittest.mock import patch
    from os.path import join as opj, dirname, normpath, isabs

    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo

    import logging
    lgr = logging.getLogger('datalad.plugin.tarball')

    if argv:
        lgr.warn("tarball exporter ignores any additional options '{}'".format(
            argv))

    repo = dataset.repo
    committed_date = repo.get_committed_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti

    if output is None:
        output = "datalad_{}.tar.gz".format(dataset.id)
    else:
        if not output.endswith('.tar.gz'):
            output += '.tar.gz'

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(output)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(output, "w:gz") as tar:
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(
                repo_files, allow_quick=True, batch=True)
        else:
            annexed = [False] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            if annexed[i]:
                # resolve to possible link target
                link_target = os.readlink(fpath)
                if not isabs(link_target):
                    link_target = normpath(opj(dirname(fpath), link_target))
                fpath = link_target
            # name in the tarball
            aname = normpath(opj(leading_dir, rpath))
            tar.add(
                fpath,
                arcname=aname,
                recursive=False,
                filter=_filter_tarinfo)

    # I think it might be better to return the "final" filename where stuff was saved
    return output
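The timestamp handling above is a general-purpose trick rather than anything DataLad-specific: TarFile.add() accepts a filter callback that can rewrite each TarInfo, and patching time.time pins the timestamp that the gzip writer records in its header. A standalone sketch of the same idea using only the standard library (some_file.txt is a placeholder path):

import tarfile
from unittest.mock import patch

FIXED_DATE = 1500000000  # e.g. the date of the last commit

def _pin_mtime(ti):
    # normalize the per-file modification time recorded in the tarball
    ti.mtime = FIXED_DATE
    return ti

# gzip records time.time() in its header, so patch it to get reproducible bytes
with patch('time.time', return_value=FIXED_DATE), \
        tarfile.open('example.tar.gz', 'w:gz') as tar:
    tar.add('some_file.txt', arcname='leading_dir/some_file.txt',
            recursive=False, filter=_pin_mtime)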
Example #3
def test_file_basename():
    # eq_ is the assert-equal test helper; its import path may vary across
    # DataLad versions
    from datalad.tests.utils import eq_
    from datalad.utils import file_basename
    eq_(file_basename('1'), '1')
    eq_(file_basename('d1/1'), '1')
    eq_(file_basename('/d1/1'), '1')
    eq_(file_basename('1.'), '1.')
    eq_(file_basename('1.tar.gz'), '1')
    eq_(file_basename('1.Tar.gz'), '1')
    eq_(file_basename('1._bak.gz'), '1')
    eq_(file_basename('1.tar.gz', return_ext=True), ('1', 'tar.gz'))
    eq_(file_basename('/tmp/1.tar.gz'), '1')
    eq_(file_basename('/tmp/1.longish.gz'), '1.longish')
    eq_(file_basename('1_R1.1.1.tar.gz'), '1_R1.1.1')
    eq_(file_basename('ds202_R1.1.1.tgz'), 'ds202_R1.1.1')
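To make the expected behaviour concrete: file_basename strips up to two short trailing extensions, so compound suffixes such as .tar.gz disappear while longer pieces (.longish) or digit-led ones (.1) are preserved. A regex-based sketch that satisfies every assertion above (not necessarily the library's exact implementation):

import re
from os.path import basename

def file_basename_sketch(name, return_ext=False):
    # drop up to two trailing extensions of the form ".Xyyyy" where X is a
    # letter or underscore and 1-4 further non-space characters follow
    bname = basename(name)
    fbname = re.sub(r'(\.[a-zA-Z_]\S{1,4}){0,2}$', '', bname)
    if return_ext:
        return fbname, bname[len(fbname) + 1:]
    return fbname

assert file_basename_sketch('1.tar.gz', return_ext=True) == ('1', 'tar.gz')
assert file_basename_sketch('/tmp/1.longish.gz') == '1.longish'
assert file_basename_sketch('ds202_R1.1.1.tgz') == 'ds202_R1.1.1'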
Example #4
    def __call__(dataset,
                 filename=None,
                 archivetype='tar',
                 compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from unittest.mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo

        import logging
        lgr = logging.getLogger('datalad.local.export_archive')

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti

        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype, '{}{}'.format('.' if compression else '', compression)
            if archivetype == 'tar' else '')

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            filename = op.join(filename,
                               default_filename)  # under given directory
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(repo_files,
                                              allow_quick=True,
                                              batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(repo_files,
                                                    allow_quick=True,
                                                    batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning
                             if missing_content == 'continue' else lgr.debug)(
                                 'File %s has no content available, skipped',
                                 fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' %
                                          fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(
                                opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(fpath,
                           arcname=aname,
                           **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(status='ok',
                   path=filename,
                   type='file',
                   action='export_archive',
                   logger=lgr)
Example #5
    def __call__(dataset, filename=None, archivetype='tar', compression='gz',
                 missing_content='error'):
        import os
        import tarfile
        import zipfile
        from mock import patch
        from os.path import join as opj, dirname, normpath, isabs
        import os.path as op

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo
        from datalad.dochelpers import exc_str

        import logging
        lgr = logging.getLogger('datalad.plugin.export_archive')

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti
        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype,
            '{}{}'.format(
                '.' if compression else '',
                compression) if archivetype == 'tar' else '')

        default_filename = "datalad_{.id}".format(dataset)
        if filename is None:
            filename = default_filename  # in current directory
        elif op.exists(filename) and op.isdir(filename):
            filename = op.join(filename, default_filename)  # under given directory
        if not filename.endswith(file_extension):
            filename += file_extension

        root = dataset.path
        # use dir inside matching the output filename
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write
            repo_files = sorted(repo.get_indexed_files())
            if isinstance(repo, AnnexRepo):
                annexed = repo.is_under_annex(
                    repo_files, allow_quick=True, batch=True)
                # remember: returns False for files in Git!
                has_content = repo.file_has_content(
                    repo_files, allow_quick=True, batch=True)
            else:
                annexed = [False] * len(repo_files)
                has_content = [True] * len(repo_files)
            for i, rpath in enumerate(repo_files):
                fpath = opj(root, rpath)
                if annexed[i]:
                    if not has_content[i]:
                        if missing_content in ('ignore', 'continue'):
                            (lgr.warning if missing_content == 'continue' else lgr.debug)(
                                'File %s has no content available, skipped', fpath)
                            continue
                        else:
                            raise IOError('File %s has no content available' % fpath)

                    # resolve to possible link target
                    if op.islink(fpath):
                        link_target = os.readlink(fpath)
                        if not isabs(link_target):
                            link_target = normpath(opj(dirname(fpath), link_target))
                        fpath = link_target
                # name in the archive
                aname = normpath(opj(leading_dir, rpath))
                add_method(
                    fpath,
                    arcname=aname,
                    **(tar_args if archivetype == 'tar' else {}))

        if not isabs(filename):
            filename = opj(os.getcwd(), filename)

        yield dict(
            status='ok',
            path=filename,
            type='file',
            action='export_archive',
            logger=lgr)
Example #6
    def __call__(
            archive,
            *,
            dataset=None,
            annex=None,
            add_archive_leading_dir=False,
            strip_leading_dirs=False,
            leading_dirs_depth=None,
            leading_dirs_consider=None,
            use_current_dir=False,
            delete=False,
            key=False,
            exclude=None,
            rename=None,
            existing='fail',
            annex_options=None,
            copy=False,
            commit=True,
            allow_dirty=False,
            stats=None,
            drop_after=False,
            delete_after=False):

        if exclude:
            exclude = ensure_tuple_or_list(exclude)
        if rename:
            rename = ensure_tuple_or_list(rename)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add-archive-content')

        # set up common params for result records
        res_kwargs = {
            'action': 'add-archive-content',
            'logger': lgr,
        }

        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message="Can't operate in a pure Git repository",
                **res_kwargs
            )
            return
        if annex:
            warnings.warn(
                "datalad add_archive_content's `annex` parameter is "
                "deprecated and will be removed in a future release. "
                "Use the 'dataset' parameter instead.",
                DeprecationWarning)
        annex = ds.repo
        # get the archive path relative from the ds root
        archive_path = resolve_path(archive, ds=dataset)
        # let Status decide whether we can act on the given file
        for s in ds.status(
                path=archive_path,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                if 'path not underneath the reference dataset %s' in s['message']:
                    yield get_status_dict(
                        ds=ds,
                        status='impossible',
                        message='Can not add archive outside of the dataset',
                        **res_kwargs)
                    return
                # status errored & we haven't anticipated the cause. Bubble up
                yield s
                return
            elif s['state'] == 'untracked':
                # we can't act on an untracked file
                message = (
                    "Can not add an untracked archive. "
                    "Run 'datalad save {}'".format(archive)
                )
                yield get_status_dict(
                           ds=ds,
                           status='impossible',
                           message=message,
                           **res_kwargs)
                return

        if not allow_dirty and annex.dirty:
            # error out here if the dataset contains untracked changes
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required. '
                    'Use `datalad status` to inspect unsaved changes'),
                **res_kwargs
            )
            return

        # ensure the archive exists, status doesn't error on a non-existing file
        if not key and not lexists(archive_path):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'No such file: {}'.format(archive_path),
                ),
                **res_kwargs
            )
            return

        if not key:
            check_path = archive_path.relative_to(ds.pathobj)
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            # can become get_file_annexinfo once #6104 is merged
            key = annex.get_file_annexinfo(check_path)['key']
            if not key:
                raise RuntimeError(
                    f"Archive must be an annexed file in {ds}")
            archive_dir = Path(archive_path).parent
        else:
            origin = 'key'
            key = archive
            # We must not have anything to do with the location under .git/annex
            archive_dir = None
            # instead, we will go from the current directory
            use_current_dir = True

        archive_basename = file_basename(archive)

        if not key:
            # if we didn't manage to get a key, the file must be in Git
            raise NotImplementedError(
                "Provided file %s does not seem to be under annex control. "
                "We don't support adding everything straight to Git" % archive
            )

        # figure out our location
        pwd = getpwd()
        # are we in a subdirectory of the repository?
        pwd_in_root = annex.path == archive_dir
        # then we should add content under that subdirectory,
        # get the path relative to the repo top
        if use_current_dir:
            # extract the archive under the current directory, not the directory
            # where the archive is located
            extract_rpath = Path(pwd).relative_to(ds.path) \
                if not pwd_in_root \
                else None
        else:
            extract_rpath = archive_dir.relative_to(ds.path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath == curdir:
            extract_rpath = None

        try:
            key_rpath = annex.get_contentlocation(key)
        except:
            # the only probable reason for this to fail is that there is no
            # content present
            raise RuntimeError(
                "Content of %s seems to be N/A.  Fetch it first" % key
            )

        # now we simply need to go through every file in that archive and
        lgr.info(
            "Adding content of the archive %s into annex %s", archive, annex
        )

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote

        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        # OK, let's ignore that the following class is actually a special
        # remote implementation, and use it only to work with its cache
        annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                                path=annex.path,
                                                persistent_cache=True)
        # We will move extracted content so it must not exist prior running
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]
        # make sure there is an enabled datalad-archives special remote
        ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                              autoenable=True)

        precommitted = False
        old_always_commit = annex.always_commit
        # batch mode is disabled when faking dates, so we want to always commit
        annex.always_commit = annex.fake_dates_enabled
        if annex_options:
            if isinstance(annex_options, str):
                annex_options = split_cmdline(annex_options)
        delete_after_rpath = None

        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex.path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        try:
            # keep track of extracted files for progress bar logging
            file_counter = 0
            # iterate over all files in the archive
            extracted_files = list(earchive.get_extracted_files())
            # start a progress bar for extraction
            pbar_id = f'add-archive-{archive_path}'
            log_progress(
                lgr.info, pbar_id, 'Extracting archive',
                label="Extracting archive",
                unit=' Files',
                total=len(extracted_files),
                noninteractive_level=logging.INFO)
            for extracted_file in extracted_files:
                file_counter += 1
                files_left = len(extracted_files) - file_counter
                log_progress(
                    lgr.info, pbar_id,
                    "Files to extract %i ", files_left,
                    update=1,
                    increment=True,
                    noninteractive_level=logging.DEBUG)
                stats.files += 1
                extracted_path = Path(earchive.path) / Path(extracted_file)

                if extracted_path.is_symlink():
                    link_path = str(extracted_path.resolve())
                    if not exists(link_path):
                        # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning(
                            "Path %s points to non-existing file %s" %
                            (extracted_path, link_path)
                        )
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of archive - warn & skip

                url = annexarchive.get_file_url(
                    archive_key=key,
                    file=extracted_file,
                    size=os.stat(extracted_path).st_size)

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = Path(extracted_file)

                # stream archives would not have had the original filename
                # information in them, so would be extracted under a name
                # derived from their annex key.
                # Provide ad-hoc handling for such cases
                if (len(extracted_files) == 1 and
                    Path(archive).suffix in ('.xz', '.gz', '.lzma') and
                        Path(key_rpath).name.startswith(Path(
                            extracted_file).name)):
                    # take archive's name without extension for filename & place
                    # where it was originally extracted
                    target_file = \
                        Path(extracted_file).parent / Path(archive).stem

                if strip_leading_dirs:
                    leading_dir = earchive.get_leading_directory(
                        depth=leading_dirs_depth, exclude=exclude,
                        consider=leading_dirs_consider)
                    leading_dir_len = \
                        len(leading_dir) + len(opsep) if leading_dir else 0
                    target_file = str(target_file)[leading_dir_len:]

                if add_archive_leading_dir:
                    # place extracted content under a directory corresponding to
                    # the archive name with suffix stripped.
                    target_file = Path(archive_basename) / target_file

                if rename:
                    target_file = apply_replacement_rules(rename,
                                                          str(target_file))

                # continue to next iteration if extracted_file is excluded
                if exclude:
                    try:  # since we need to skip outside loop from inside loop
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping {extracted_file} since contains "
                                    "{regexp} pattern".format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if delete_after:
                    # place target file in a temporary directory
                    target_file = Path(prefix_dir) / Path(target_file)
                    # but also allow for it in the orig
                    target_file_orig = Path(prefix_dir) / Path(target_file_orig)

                target_file_path_orig = annex.pathobj / target_file_orig

                # If we were invoked in a subdirectory, patch together the
                # correct path
                target_file_path = extract_rpath / target_file \
                    if extract_rpath else target_file
                target_file_path = annex.pathobj / target_file_path

                # when the file already exists...
                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(str(target_file_path)) == \
                            md5sum(str(extracted_path)):
                        if not annex.is_under_annex(str(extracted_path)):
                            # if under annex -- must be having the same content,
                            # we should just add possibly a new extra URL
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        message = \
                            "{} exists, but would be overwritten by new file " \
                            "{}. Consider adjusting --existing".format(
                                target_file_path, extracted_file)
                        yield get_status_dict(
                            ds=ds,
                            status='error',
                            message=message,
                            **res_kwargs)
                        return
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a
                        # tree
                        rmtree(target_file_path)
                    else:
                        # an elaborate dance to piece together new archive names
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the
                        # filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            # we shouldn't get here, argparse should catch a
                            # non-existing value for --existing right away
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file
                        # doesn't collide
                        suf, i = '', 0
                        while True:
                            connector = \
                                ('.' if (fn_ext or ends_with_dot) else '')
                            file = fn_base + suf + connector + fn_ext
                            target_file_path_new =  \
                                Path(p) / Path(file)
                            if not lexists(target_file_path_new):
                                # we found a file name that is not yet taken
                                break
                            lgr.debug("Iteration %i of file name finding. "
                                      "File %s already exists", i,
                                      target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                if copy:
                    raise NotImplementedError(
                        "Not yet copying from 'persistent' cache"
                    )

                lgr.debug("Adding %s to annex pointing to %s and with options "
                          "%r", target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:
                    # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        # drop extracted files after adding to annex
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(
                        target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # we count the removal here, but don't yet perform it
                    # to not interfere with batched processes - any pure Git
                    # action invokes precommit which closes batched processes.
                    stats.removed += 1

                # Done with target_file -- just to have clear end of the loop
                del target_file

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since the file might sometimes still be staged and fail
                annex.remove(str(archive_path), force=True)

            lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) \
                    if extract_rpath else prefix_dir
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(str(delete_after_rpath), r=True, force=True)
            if commit:
                archive_rpath = archive_path.relative_to(ds.path)
                commit_stats = outside_stats if outside_stats else stats
                # so batched ones close and files become annex symlinks etc
                annex.precommit()
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath,
                         commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
            else:
                # don't commit upon completion
                pass
        finally:
            # take down the progress bar
            log_progress(
                lgr.info, pbar_id,
                'Finished extraction',
                noninteractive_level=logging.INFO)
            # since we batched addurl, we should close those batched processes
            # if haven't done yet.  explicitly checked to avoid any possible
            # "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex.path, delete_after_rpath)
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)
            # remove tempfile directories (not cleaned up automatically):
            if prefix_dir is not None and lexists(prefix_dir):
                os.rmdir(prefix_dir)
        yield get_status_dict(
            ds=ds,
            status='ok',
            **res_kwargs)
        return annex
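In day-to-day use this implementation is reached through the add-archive-content command rather than by calling __call__ directly. A hedged usage sketch via the Python API; parameter names are taken from the signature above, the dataset and archive paths are placeholders, and option availability may differ across DataLad versions:

import datalad.api as dl

ds = dl.Dataset('/path/to/dataset')            # placeholder dataset location
# the archive has to be annexed/saved in the dataset before it can be processed
ds.save('tarballs/images.tar.gz', message='Add raw archive')
dl.add_archive_content(
    'tarballs/images.tar.gz',
    dataset=ds,
    strip_leading_dirs=True,    # drop the common top-level directory of the archive
    delete=True,                # remove the original archive once extracted
    existing='archive-suffix',  # resolve name collisions as handled above
)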
Example #7
def dlplugin(dataset, output=None):
    import os
    import tarfile
    from mock import patch
    from os.path import join as opj, dirname, normpath, isabs
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo

    import logging
    lgr = logging.getLogger('datalad.plugin.tarball')

    repo = dataset.repo
    committed_date = repo.get_committed_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti

    if output is None:
        output = "datalad_{}.tar.gz".format(dataset.id)
    else:
        if not output.endswith('.tar.gz'):
            output += '.tar.gz'

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(output)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(output, "w:gz") as tar:
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(repo_files,
                                          allow_quick=True,
                                          batch=True)
        else:
            annexed = [False] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            if annexed[i]:
                # resolve to possible link target
                link_target = os.readlink(fpath)
                if not isabs(link_target):
                    link_target = normpath(opj(dirname(fpath), link_target))
                fpath = link_target
            # name in the tarball
            aname = normpath(opj(leading_dir, rpath))
            tar.add(fpath,
                    arcname=aname,
                    recursive=False,
                    filter=_filter_tarinfo)

    if not isabs(output):
        output = opj(os.getcwd(), output)

    yield dict(status='ok',
               path=output,
               type='file',
               action='export_tarball',
               logger=lgr)
Example #8
def dlplugin(dataset,
             filename=None,
             archivetype='tar',
             compression='gz',
             missing_content='error'):
    """Export the content of a dataset as a TAR/ZIP archive.

    Parameters
    ----------
    filename : str, optional
      File name of the generated TAR archive. If no file name is given
      the archive will be generated in the current directory and will
      be named: datalad_<dataset_uuid>.(tar.*|zip).
    archivetype : {'tar', 'zip'}
      Type of archive to generate.
    compression : {'', 'gz', 'bz2'}
      Compression method to use. 'bz2' is not supported for ZIP archives.
    missing_content : {'error', 'continue', 'ignore'}, optional
      By default, any discovered file with missing content will result in
      an error and the plugin is aborted. Setting this to 'continue' will
      issue warnings instead of failing. The value 'ignore' will
      only inform about the problem at the 'debug' log level. The latter two
      can be helpful when generating a TAR archive from a dataset where
      some file content is not available locally.

    """
    import os
    import tarfile
    import zipfile
    from mock import patch
    from os.path import join as opj, dirname, normpath, isabs
    from datalad.utils import file_basename
    from datalad.support.annexrepo import AnnexRepo
    from datalad.dochelpers import exc_str

    import logging
    lgr = logging.getLogger('datalad.plugin.export_archive')

    repo = dataset.repo
    committed_date = repo.get_committed_date()

    # could be used later on to filter files by some criterion
    def _filter_tarinfo(ti):
        # Reset the date to match the one of the last commit, not from the
        # filesystem since git doesn't track those at all
        # TODO: use the date of the last commit when any particular
        # file was changed -- would be the most kosher yoh thinks to the
        # degree of our abilities
        ti.mtime = committed_date
        return ti

    tar_args = dict(recursive=False, filter=_filter_tarinfo)

    file_extension = '.{}{}'.format(
        archivetype, '{}{}'.format('.' if compression else '', compression)
        if archivetype == 'tar' else '')

    if filename is None:
        filename = "datalad_{}".format(dataset.id)
    if not filename.endswith(file_extension):
        filename += file_extension

    root = dataset.path
    # use dir inside matching the output filename
    # TODO: could be an option to the export plugin allowing empty value
    # for no leading dir
    leading_dir = file_basename(filename)

    # workaround for inability to pass down the time stamp
    with patch('time.time', return_value=committed_date), \
            tarfile.open(filename, "w:{}".format(compression)) \
            if archivetype == 'tar' \
            else zipfile.ZipFile(
                filename, 'w',
                zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
            as archive:
        add_method = archive.add if archivetype == 'tar' else archive.write
        repo_files = sorted(repo.get_indexed_files())
        if isinstance(repo, AnnexRepo):
            annexed = repo.is_under_annex(repo_files,
                                          allow_quick=True,
                                          batch=True)
            # remember: returns False for files in Git!
            has_content = repo.file_has_content(repo_files,
                                                allow_quick=True,
                                                batch=True)
        else:
            annexed = [False] * len(repo_files)
            has_content = [True] * len(repo_files)
        for i, rpath in enumerate(repo_files):
            fpath = opj(root, rpath)
            if annexed[i]:
                if not has_content[i]:
                    if missing_content in ('ignore', 'continue'):
                        (lgr.warning
                         if missing_content == 'continue' else lgr.debug)(
                             'File %s has no content available, skipped',
                             fpath)
                        continue
                    else:
                        raise IOError('File %s has no content available'
                                      % fpath)

                # resolve to possible link target
                link_target = os.readlink(fpath)
                if not isabs(link_target):
                    link_target = normpath(opj(dirname(fpath), link_target))
                fpath = link_target
            # name in the archive
            aname = normpath(opj(leading_dir, rpath))
            add_method(fpath,
                       arcname=aname,
                       **(tar_args if archivetype == 'tar' else {}))

    if not isabs(filename):
        filename = opj(os.getcwd(), filename)

    yield dict(status='ok',
               path=filename,
               type='file',
               action='export_archive',
               logger=lgr)
Example #9
    def __call__(filename=None,
                 *,
                 dataset=None,
                 archivetype='tar',
                 compression='gz',
                 missing_content='error'):
        import tarfile
        import zipfile
        from unittest.mock import patch

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import file_basename
        from datalad.support.annexrepo import AnnexRepo

        import logging
        lgr = logging.getLogger('datalad.local.export_archive')

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='export archive')

        repo = dataset.repo
        committed_date = repo.get_commit_date()

        # could be used later on to filter files by some criterion
        def _filter_tarinfo(ti):
            # Reset the date to match the one of the last commit, not from the
            # filesystem since git doesn't track those at all
            # TODO: use the date of the last commit when any particular
            # file was changed -- would be the most kosher yoh thinks to the
            # degree of our abilities
            ti.mtime = committed_date
            return ti

        tar_args = dict(recursive=False, filter=_filter_tarinfo)

        file_extension = '.{}{}'.format(
            archivetype, '{}{}'.format('.' if compression else '', compression)
            if archivetype == 'tar' else '')

        default_filename = "datalad_{.id}".format(dataset)
        if filename is not None:
            filename = Path(filename)
        if filename is None:
            filename = Path(default_filename)  # in current directory
        elif filename.exists() and filename.is_dir():
            filename = filename / default_filename  # under given directory
        if filename.suffix != file_extension:
            filename = filename.with_suffix(file_extension)

        root = dataset.path
        # use dir inside matching the output filename without suffix(es)
        # TODO: could be an option to the export plugin allowing empty value
        # for no leading dir
        leading_dir = file_basename(filename)

        # workaround for inability to pass down the time stamp
        with patch('time.time', return_value=committed_date), \
                tarfile.open(filename, "w:{}".format(compression)) \
                if archivetype == 'tar' \
                else zipfile.ZipFile(
                    filename, 'w',
                    zipfile.ZIP_STORED if not compression else zipfile.ZIP_DEFLATED) \
                as archive:
            add_method = archive.add if archivetype == 'tar' else archive.write

            repo_files = repo.get_content_info(ref='HEAD', untracked='no')
            if isinstance(repo, AnnexRepo):
                # add availability (has_content) info
                repo_files = repo.get_content_annexinfo(ref='HEAD',
                                                        init=repo_files,
                                                        eval_availability=True)
            for p, props in repo_files.items():
                if 'key' in props and not props.get('has_content', False):
                    if missing_content in ('ignore', 'continue'):
                        (lgr.warning
                         if missing_content == 'continue' else lgr.debug)(
                             'File %s has no content available, skipped', p)
                        continue
                    else:
                        raise IOError('File %s has no content available' % p)
                # name in the archive
                aname = Path(leading_dir) / p.relative_to(repo.pathobj)
                add_method(p if 'key' not in props else props['objloc'],
                           arcname=aname,
                           **(tar_args if archivetype == 'tar' else {}))

        yield dict(status='ok',
                   path=filename.resolve(),
                   type='file',
                   action='export_archive',
                   logger=lgr)
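For completeness, the implementations above back the export-archive command, so they are normally reached through the Python or command-line API rather than by calling __call__ directly. A hedged usage sketch; parameter names come from the signature above and the paths are placeholders:

import datalad.api as dl

ds = dl.Dataset('/path/to/dataset')   # placeholder dataset location
# gzipped TAR (the defaults of the signature above); skip files without content
ds.export_archive(filename='/tmp/myds', archivetype='tar', compression='gz',
                  missing_content='continue')

# roughly equivalent from the shell:
#   datalad export-archive -d /path/to/dataset --missing-content continue /tmp/myds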