Example no. 1
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path
    of the dataset that contains the query path, plus any subdataset
    in case of recursion (with their own DB entries).

    Parameters
    ----------
    ds_path : str
      absolute path of the source dataset whose aggregate DB is queried
    path : str
      absolute path for which to obtain metadata
    recursive : bool
      whether to also report records for all known datasets underneath the
      dataset containing `path`
    db : dict
      dictionary to receive the aggregate metadata records

    Returns
    -------
    tuple or list
      A tuple is an error message (a format string followed by its arguments),
      a list contains the absolute paths of all datasets for which info was
      put into the DB.
    """
    # TODO cache these
    agginfos = load_ds_aggregate_db(Dataset(ds_path), abspath=True)

    seed_ds = _get_containingds_from_agginfo(agginfos, path)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return (
            "No matching aggregated metadata for path '%s' in Dataset at %s",
            op.relpath(path, start=ds_path), ds_path)

    # easy peasy
    db[seed_ds] = agginfos[seed_ds]
    hits = [seed_ds]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if path_is_subpath(agginfo_path, seed_ds):
            db[agginfo_path] = agginfos[agginfo_path]
            hits.append(agginfo_path)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
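
A minimal usage sketch for the function above; the paths are hypothetical and the DataLad helpers it relies on (Dataset, load_ds_aggregate_db, ...) are assumed to be importable in the surrounding module:

db = {}
res = _get_dsinfo_from_aggmetadata(
    '/data/super',              # dataset holding the aggregate DB (hypothetical)
    '/data/super/sub/anat',     # query path somewhere inside it (hypothetical)
    recursive=True,
    db=db)
if isinstance(res, list):
    # `db` now maps each reported dataset path to its aggregate record
    for hit in res:
        print(hit, db[hit].get('refcommit'))
else:
    # a tuple signals an error: format string plus its arguments
    print(res[0] % tuple(res[1:]))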
Example no. 2
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path
    of the dataset that contains the query path, plus any subdataset
    in case of recursion (with their own DB entries).

    Parameters
    ----------
    ds_path : str
      absolute path of the source dataset whose aggregate DB is queried
    path : str
      absolute path for which to obtain metadata
    recursive : bool
      whether to also report records for all known datasets underneath the
      dataset containing `path`
    db : dict
      dictionary to receive the aggregate metadata records

    Returns
    -------
    tuple or list
      A tuple is an error message (a format string followed by its arguments),
      a list contains the absolute paths of all datasets for which info was
      put into the DB.
    """
    # TODO cache these
    agginfos = load_ds_aggregate_db(Dataset(ds_path), abspath=True)

    seed_ds = _get_containingds_from_agginfo(agginfos, path)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return ("No matching aggregated metadata for path '%s' in Dataset at %s",
                op.relpath(path, start=ds_path), ds_path)

    # easy peasy
    db[seed_ds] = agginfos[seed_ds]
    hits = [seed_ds]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if path_is_subpath(agginfo_path, seed_ds):
            db[agginfo_path] = agginfos[agginfo_path]
            hits.append(agginfo_path)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
Example no. 3
    def __call__(
            path=None,
            *,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            update_mode='target',
            incremental=False,
            force_extraction=False,
            save=True):
        refds_path = require_dataset(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = ensure_list(path)
        if not path:
            # then current/reference dataset is "aggregated"
            # We should not always add ds.path, since then --recursive would
            # also recurse into the current dataset even if specific paths
            # were given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(
            ds,
            # do not warn here, next call triggers the same warning
            warn_absent=False)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        paths_by_ds, errors = get_paths_by_ds(
            require_dataset(dataset),
            dataset,
            paths=ensure_list(path),
            subdsroot_mode='super')
        for ap in _minimal_annotate_paths(
                paths_by_ds,
                errors,
                action='aggregate_metadata',
                recursive=recursive,
                recursion_limit=recursion_limit):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'],
                        aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'],
                        aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately place
                # generated objects into the aggregated or reference dataset,
                # and put info into DB to get it distributed to all datasets
                # that need to be updated
                errored = _dump_extracted_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save,
                    force_extraction,
                    agg_base_path)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object files
        # somewhere, we know what needs saving (but have not saved anything yet),
        # and we know about the states of all aggregated datasets in the DB
        # what remains to be done is to update all datasets, so they have their
        # own copy of aggregated metadata and update their respective
        # aggregate.json with info on what states we just aggregated from

        # first, let's figure out which datasets need updating at all
        # get adjacency info of the dataset tree spanning the base to all leaf
        # datasets associated with the path arguments
        if update_mode == 'all':
            ds_adj = {}
            discover_dataset_trace_to_targets(
                ds.path, to_aggregate, [], ds_adj,
                # we know that to_aggregate only lists datasets, existing and
                # absent ones -- we want to aggregate all of them, either from
                # just extracted metadata, or from previously aggregated metadata
                # of the closest superdataset
                includeds=to_aggregate)
            # TODO we need to work in the info about datasets that we only got
            # from aggregated metadata and that have no trace on the file system!!
            subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        elif update_mode == 'target':
            subtrees = {ds.path: list(agginfo_db.keys())}
        else:
            raise ValueError(
                "unknown `update_mode` '%s' for metadata aggregation"
                % update_mode)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                incremental,
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                # save does not need any pre-annotated path hints
                path=[r['path'] for r in to_save],
                dataset=refds_path,
                message='[DATALAD] Dataset aggregate metadata update',
                return_type='generator',
                result_renderer='disabled',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
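
The `subtrees` mapping built above associates each dataset that will receive an update with the dataset paths whose records it should carry. A small illustration with hypothetical paths; the 'target' case mirrors the explicit construction in the code, while the 'all' case is produced by _adj2subtrees and is only shown here as an assumed example shape:

agginfo_db = {'/data/super': {}, '/data/super/sub': {}}   # hypothetical DB
ds_path = '/data/super'

# update_mode='target': only the reference dataset gets records for
# everything currently in the DB
subtrees = {ds_path: list(agginfo_db.keys())}
# -> {'/data/super': ['/data/super', '/data/super/sub']}

# update_mode='all' (assumed shape): every superdataset on the trace down to
# the aggregated leaves is updated with records for its own subdatasets
subtrees = {
    '/data/super': ['/data/super/sub'],
    '/data/super/sub': [],
}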
Example no. 4
def _update_ds_agginfo(refds_path, ds_path, subds_paths, incremental, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset whose aggregate info is to be updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset.
      Any subdataset that is not listed here is assumed to be gone (i.e. no longer
      a subdataset at all, not just not locally installed)
    incremental : bool
      If set, the update will not remove any information on datasets not listed in
      subds_paths
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # load existing aggregate info dict
    # makes sure all file/dataset paths become absolute
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    agginfo_fpath, agg_base_path = get_ds_aggregate_db_locations(ds)
    ds_agginfos = load_ds_aggregate_db(ds, abspath=True)
    # object locations referenced initially
    objlocs_was = set(ai[k]
                      for ai in ds_agginfos.values()
                      for k in location_keys
                      if k in ai)
    # track which objects need to be copied (each item is a from/to tuple)
    objs2copy = []
    # for each subdataset (any depth level)
    procds_paths = [ds.path] + subds_paths
    for dpath in procds_paths:
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        # relative path of the current dataset within the dataset we are updating
        drelpath = op.relpath(dpath, start=ds.path)
        for loclabel in location_keys:
            # TODO filepath_info is obsolete
            if loclabel == 'filepath_info' and drelpath == op.curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objpath = op.join(agg_base_path, *objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where it is
            # needed in this dataset
            objs2copy.append((
                # this needs to turn into an absolute path
                # `dpath` will be relative to the reference dataset
                #op.normpath(op.join(ds.path, dpath, op.dirname(agginfo_relpath), objloc)),
                objloc,
                target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objpath
        # (re)assign in case record is new
        ds_agginfos[dpath] = ds_dbinfo
    # remove all entries for which we do not (or no longer) have a corresponding
    # subdataset to take care of
    if not incremental:
        ds_agginfos = {k: v
                       for k, v in ds_agginfos.items()
                       if k in procds_paths}
    # set of metadata objects now referenced
    objlocs_is = set(
        ai[k]
        for sdsrpath, ai in ds_agginfos.items()
        for k in location_keys
        if k in ai)
    objs2add = objlocs_is

    # yoh: we apparently do need to filter the ones to remove - I did
    #      "git reset --hard HEAD^" and
    #      aggregate-metadata failed upon next run trying to remove
    #      a file unknown to git. I have yet to figure out why that
    #      mattered (hopefully not that reflog is used somehow)
    objs2remove = []
    for obj in objlocs_was.difference(objlocs_is):
        if op.lexists(obj):
            objs2remove.append(obj)
        else:
            # not really a warning, we don't need it anymore, it is already gone
            lgr.debug(
                "To-be-deleted metadata object not found, skip deletion (%s)",
                obj
            )

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if not incremental and objs2remove:
        ds.remove(
            objs2remove,
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            reckless='kill',
            result_renderer='disabled',
            return_type='list')
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    objs2copy = [(f, t) for f, t in objs2copy if f != t]
    # must copy object files to local target destination
    # make sure those objects are present
    # use the reference dataset to resolve paths, as they might point to
    # any location in the dataset tree
    Dataset(refds_path).get(
        [f for f, t in objs2copy],
        result_renderer='disabled')
    for copy_from, copy_to in objs2copy:
        copy_from = op.join(agg_base_path, copy_from)
        copy_to = op.join(agg_base_path, copy_to)
        target_dir = op.dirname(copy_to)
        if not op.exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if op.lexists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added standard way, depending on the repo type
        ds.repo.add([op.join(agg_base_path, p) for p in objs2add])
        # queue for save, and mark as staged
        to_save.extend(
            [dict(path=op.join(agg_base_path, p), type='file', staged=True)
             for p in objs2add])
    # write aggregate info file
    if not ds_agginfos:
        return

    _store_agginfo_db(ds, ds_agginfos)
    ds.repo.add(agginfo_fpath, git=True)
    # queue for save, and mark as staged
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))
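
The path rewrite inside the per-dataset loop above keeps only the last three components of a referenced object location (objects/{hash-dir}/{hash-file}) and re-roots them under the updating dataset's aggregate base path. A standalone illustration with hypothetical values:

import os
import os.path as op

agg_base_path = '/data/super/.datalad/metadata'                    # hypothetical
objloc = '/data/super/sub/.datalad/metadata/objects/ab/cd0123.xz'  # hypothetical
# keep 'objects/{hash-dir}/{hash-file}' and re-root it under agg_base_path
target_objpath = op.join(agg_base_path, *objloc.split(os.sep)[-3:])
print(target_objpath)
# /data/super/.datalad/metadata/objects/ab/cd0123.xz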
Example no. 5
def _dump_extracted_metadata(agginto_ds, aggfrom_ds, db, to_save, force_extraction, agg_base_path):
    """Dump metadata from a dataset into object in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    Parameters
    ----------
    agginto_ds : Dataset
    aggfrom_ds : Dataset
    db : dict
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths', return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2. our own dataset-global metadata and the dataset config
    for tfile in (
            op.join(aggfrom_ds.path, DATASET_METADATA_FILE),
            op.join(aggfrom_ds.path, DATASET_CONFIG_FILE)):
        if op.exists(tfile):
            objid += md5(open(tfile, 'r').read().encode()).hexdigest()
    # 3. potential annex-based metadata
    # XXX TODO shouldn't this be the annex extractor?
    if isinstance(aggfrom_ds.repo, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps = aggfrom_ds.repo.call_annex_oneline([
            'metadata',
            '.',
            '-g', 'lastchanged'])
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug(
            'Dump metadata of %s into %s',
            aggfrom_ds, agginto_ds)

    # check if we already have in store what we are about to create
    old_agginfo = db.get(aggfrom_ds.path, {})

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        # this is no error, there is simply no metadata whatsoever
        return False

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    # assemble info on the metadata extraction and storage
    #               label  type      targetds    storage method
    metasources = {'ds': {'type': 'dataset', 'targetds': agginto_ds, 'dumper': json_py.dump}}
    # do not store content metadata if either the source or the target dataset
    # do not want it
    # TODO this AND was an OR before (wrong), misses a test
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) and \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
        metasources['cn'] = {
            'type': 'content',
            'targetds': agginto_ds,
            'dumper': json_py.dump2xzstream}

    # check if we have the extracted metadata for this state already
    # either in the source or in the destination dataset
    # The situation is trickier!  Extracted metadata could change for the same
    # state (commit etc), e.g. if extractors changed.
    # The "correct" thing would be either
    # - to inspect git history to see whether there were changes
    #   within aggfrom_ds since agginto_ds got the metadata committed OR
    # - check by content - if file is under git - compute checksum,
    #   if under annex -- take checksum from the key without asking for the
    #   content
    metafound = {}
    uptodatemeta = []  # record which meta was not only found but also matches in content
    # TODO: current fixes might break logic for when fromds is not installed,
    #       in which case I guess we just need to skip it?
    if not force_extraction:
        for s, sprop in metasources.items():
            objloc = op.join(agg_base_path,
                             _get_obj_location(objid, s, sprop['dumper']))
            smetafound = [
                # important to test for lexists() as we do not need to
                # or want to `get()` metadata files for this test.
                # Info on identity is NOT sufficient - later compare content if
                # multiple found
                objloc if op.lexists(op.join(d.path, objloc)) else None
                # Order of dss matters later
                for d in (aggfrom_ds, agginto_ds)
            ]
            if all(smetafound):
                # both have it
                metafound[s] = smetafound
                # but are they the same?
                try:
                    if _the_same_across_datasets(objloc, aggfrom_ds, agginto_ds):
                        uptodatemeta.append(s)
                except RuntimeError as exc:
                    # TODO: dedicated test - when meta content changes
                    lgr.debug("For now will just do re-extraction since caught %s",
                              CapturedException(exc))
            # source one has it, so we might be able to copy it
            # TODO: dedicated test - when it is sufficient to copy we do not re-extract

    if len(metafound) != len(metasources):
        # some (either ds or cn) metadata is missing from the source and/or the
        # target dataset
        lgr.debug(
            "Incomplete or absent metadata while aggregating %s <- %s: %s",
            agginto_ds, aggfrom_ds, metafound
        )
        # no metadata found -> extract
        # this places metadata dump files into the configured
        # target dataset and lists them in `to_save`, as well
        # as updates the `db` record for `aggfrom_ds`
        return _extract_metadata(
            agginto_ds,
            aggfrom_ds,
            db,
            to_save,
            objid,
            metasources,
            refcommit,
            subds_relpaths,
            agg_base_path)

    # we did not actually run an extraction, so we need to
    # assemble an aggregation record from the existing pieces
    # that we found
    # simple case: the target dataset has all the records already and they are up to date:
    if len(uptodatemeta) == len(metasources):
        lgr.debug('Sticking with up-to-date metadata for %s', aggfrom_ds)
        # no change, use old record from the target dataset
        db[aggfrom_ds.path] = old_agginfo
        # no error
        return False
    else:
        lgr.debug('Reusing previously extracted metadata for %s', aggfrom_ds)
        # we need to move the metadata dump(s) into the target dataset
        objrelpaths = {
            label: next(filter(bool, smetafound))
            for label, smetafound in metafound.items()
        }
        # make sure all the to-be-moved metadata records are present
        # locally
        aggfrom_ds.get(
            path=[op.join(aggfrom_ds.path, p)
                  for p in objrelpaths.values()],
            result_renderer='disabled')

        # actually copy dump files
        for objrelpath in objrelpaths.values():
            objpath = op.join(agginto_ds.path, objrelpath)
            objdir = op.dirname(objpath)
            if not op.exists(objdir):
                makedirs(objdir)
            if op.lexists(objpath):
                os.unlink(objpath)  # remove previous version first
                # was a wild thought as a workaround for 
                # http://git-annex.branchable.com/bugs/cannot_commit___34__annex_add__34__ed_modified_file_which_switched_its_largefile_status_to_be_committed_to_git_now/#comment-bf70dd0071de1bfdae9fd4f736fd1ec1
                # agginto_ds.repo.remove(objpath)
            # XXX TODO once we have a command that can copy/move files
            # from one dataset to another including file availability
            # info, this should be used here
            shutil.copyfile(
                op.join(aggfrom_ds.path, objrelpath),
                objpath)
            # mark for saving
            to_save.append(dict(
                path=objpath,
                parentds=agginto_ds.path,
                type='file'))

        # lastly get 'self' aggregation record from source dataset and
        # use in target dataset
        db[aggfrom_ds.path] = load_ds_aggregate_db(aggfrom_ds, abspath=True)[aggfrom_ds.path]
        return False
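
The state identity computed above concatenates the reference commit, MD5 digests of the dataset-global metadata and config files, and (for annex repos) the last-changed timestamp of the annex metadata, before shortening the whole string to a single MD5 digest used in object file names. A self-contained sketch of that fingerprinting step; the function name and the assumption of the same file locations are illustrative only:

import os.path as op
from hashlib import md5

def state_fingerprint(ds_path, refcommit, annex_lastchanged=None):
    # start from the last commit that touched metadata-relevant content
    objid = refcommit if refcommit else ''
    # fold in dataset-global metadata and the dataset config, if present
    for tfile in (op.join(ds_path, '.datalad', 'metadata', 'dataset.json'),
                  op.join(ds_path, '.datalad', 'config')):
        if op.exists(tfile):
            with open(tfile, 'rb') as f:
                objid += md5(f.read()).hexdigest()
    # fold in the annex metadata timestamp, if one was obtained
    if annex_lastchanged:
        objid += annex_lastchanged.strip()
    # no metadata-relevant content -> no state to describe
    if not objid:
        return None
    # shorten to a single MD5 hex digest
    return md5(objid.encode()).hexdigest()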
Example no. 6
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            update_mode='target',
            incremental=False,
            force_extraction=False,
            save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = assure_list(path)
        if not path:
            # then current/reference dataset is "aggregated"
            # We should not always add ds.path, since then --recursive would
            # also recurse into the current dataset even if specific paths
            # were given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(ds)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'],
                        aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'],
                        aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately place
                # generated objects into the aggregated or reference dataset,
                # and put info into DB to get it distributed to all datasets
                # that need to be updated
                errored = _dump_extracted_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save,
                    force_extraction,
                    agg_base_path)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object files
        # somewhere, we know what needs saving (but have not saved anything yet),
        # and we know about the states of all aggregated datasets in the DB
        # what remains to be done is to update all datasets, so they have their
        # own copy of aggregated metadata and update their respective
        # aggregate.json with info on what states we just aggregated from

        # first, let's figure out which datasets need updating at all
        # get adjacency info of the dataset tree spanning the base to all leaf
        # datasets associated with the path arguments
        if update_mode == 'all':
            ds_adj = {}
            discover_dataset_trace_to_targets(
                ds.path, to_aggregate, [], ds_adj,
                # we know that to_aggregate only lists datasets, existing and
                # absent ones -- we want to aggregate all of them, either from
                # just extracted metadata, or from previously aggregated metadata
                # of the closest superdataset
                includeds=to_aggregate)
            # TODO we need to work in the info about datasets that we only got
            # from aggregated metadata and that have no trace on the file system!!
            subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        elif update_mode == 'target':
            subtrees = {ds.path: list(agginfo_db.keys())}
        else:
            raise ValueError(
                "unknown `update_mode` '%s' for metadata aggregation"
                % update_mode)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                incremental,
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                path=to_save,
                dataset=refds_path,
                message='[DATALAD] Dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
Example no. 7
def _update_ds_agginfo(refds_path, ds_path, subds_paths, incremental, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset whose aggregate info is to be updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset.
      Any subdataset that is not listed here is assumed to be gone (i.e. no longer
      a subdataset at all, not just not locally installed)
    incremental : bool
      If set, the update will not remove any information on datasets not listed in
      subds_paths
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # load existing aggregate info dict
    # makes sure all file/dataset paths become absolute
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    agginfo_fpath, agg_base_path = get_ds_aggregate_db_locations(ds)
    ds_agginfos = load_ds_aggregate_db(ds, abspath=True)
    # object locations referenced initially
    objlocs_was = set(ai[k]
                      for ai in ds_agginfos.values()
                      for k in location_keys
                      if k in ai)
    # track which objects need to be copied (each item is a from/to tuple)
    objs2copy = []
    # for each subdataset (any depth level)
    procds_paths = [ds.path] + subds_paths
    for dpath in procds_paths:
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        # relative path of the current dataset within the dataset we are updating
        drelpath = op.relpath(dpath, start=ds.path)
        for loclabel in location_keys:
            # TODO filepath_info is obsolete
            if loclabel == 'filepath_info' and drelpath == op.curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objpath = op.join(agg_base_path, *objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where it is
            # needed in this dataset
            objs2copy.append((
                # this needs to turn into an absolute path
                # `dpath` will be relative to the reference dataset
                #op.normpath(op.join(ds.path, dpath, op.dirname(agginfo_relpath), objloc)),
                objloc,
                target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objpath
        # (re)assign in case record is new
        ds_agginfos[dpath] = ds_dbinfo
    # remove all entries for which we do not (or no longer) have a corresponding
    # subdataset to take care of
    if not incremental:
        ds_agginfos = {k: v
                       for k, v in ds_agginfos.items()
                       if k in procds_paths}
    # set of metadata objects now referenced
    objlocs_is = set(
        ai[k]
        for sdsrpath, ai in ds_agginfos.items()
        for k in location_keys
        if k in ai)
    objs2add = objlocs_is

    # yoh: we apparently do need to filter the ones to remove - I did
    #      "git reset --hard HEAD^" and
    #      aggregate-metadata failed upon next run trying to remove
    #      a file unknown to git. I have yet to figure out why that
    #      mattered (hopefully not that reflog is used somehow)
    objs2remove = []
    for obj in objlocs_was.difference(objlocs_is):
        if op.lexists(obj):
            objs2remove.append(obj)
        else:
            # not really a warning, we don't need it anymore, it is already gone
            lgr.debug(
                "To-be-deleted metadata object not found, skip deletion (%s)",
                obj
            )

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if not incremental and objs2remove:
        ds.remove(
            objs2remove,
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            check=False,
            result_renderer=None, return_type=list)
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    objs2copy = [(f, t) for f, t in objs2copy if f != t]
    # must copy object files to local target destination
    # make sure those objects are present
    # use the reference dataset to resolve paths, as they might point to
    # any location in the dataset tree
    Dataset(refds_path).get(
        [f for f, t in objs2copy],
        result_renderer='disabled')
    for copy_from, copy_to in objs2copy:
        copy_from = op.join(agg_base_path, copy_from)
        copy_to = op.join(agg_base_path, copy_to)
        target_dir = op.dirname(copy_to)
        if not op.exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if op.lexists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added standard way, depending on the repo type
        ds.add(
            [op.join(agg_base_path, p) for p in objs2add],
            save=False, result_renderer=None, return_type=list)
        # queue for save, and mark as staged
        to_save.extend(
            [dict(path=op.join(agg_base_path, p), type='file', staged=True)
             for p in objs2add])
    # write aggregate info file
    if not ds_agginfos:
        return

    _store_agginfo_db(ds, ds_agginfos)
    ds.add(agginfo_fpath, save=False, to_git=True,
           result_renderer=None, return_type=list)
    # queue for save, and mark as staged
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))
Example no. 8
def _dump_extracted_metadata(agginto_ds, aggfrom_ds, db, to_save, force_extraction, agg_base_path):
    """Dump metadata from a dataset into object in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    Parameters
    ----------
    agginto_ds : Dataset
    aggfrom_ds : Dataset
    db : dict
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths', return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2. our own dataset-global metadata and the dataset config
    for tfile in (
            op.join(aggfrom_ds.path, '.datalad', 'metadata', 'dataset.json'),
            op.join(aggfrom_ds.path, '.datalad', 'config')):
        if op.exists(tfile):
            objid += md5(open(tfile, 'r').read().encode()).hexdigest()
    # 3. potential annex-based metadata
    # XXX TODO shouldn't this be the annex extractor?
    if isinstance(aggfrom_ds.repo, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps, _ = aggfrom_ds.repo._run_annex_command(
            'metadata',
            '.',
            '-g', 'lastchanged')
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug(
            'Dump metadata of %s into %s',
            aggfrom_ds, agginto_ds)

    # check if we already have in store what we are about to create
    old_agginfo = db.get(aggfrom_ds.path, {})

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        # this is no error, there is simply no metadata whatsoever
        return False

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    # assemble info on the metadata extraction and storage
    #               label  type      targetds    storage method
    metasources = {'ds': {'type': 'dataset', 'targetds': agginto_ds, 'dumper': json_py.dump}}
    # do not store content metadata if either the source or the target dataset
    # do not want it
    # TODO this AND was an OR before (wrong), misses a test
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) and \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
        metasources['cn'] = {
            'type': 'content',
            'targetds': agginto_ds,
            'dumper': json_py.dump2xzstream}

    # check if we have the extracted metadata for this state already
    # either in the source or in the destination dataset
    # The situation is trickier!  Extracted metadata could change for the same
    # state (commit etc), e.g. if extractors changed.
    # The "correct" thing would be either
    # - to inspect git history to see whether there were changes
    #   within aggfrom_ds since agginto_ds got the metadata committed OR
    # - check by content - if file is under git - compute checksum,
    #   if under annex -- take checksum from the key without asking for the
    #   content
    metafound = {}
    uptodatemeta = []  # record which meta was not only found but also matches in content
    # TODO: current fixes might break logic for when fromds is not installed,
    #       in which case I guess we just need to skip it?
    if not force_extraction:
        for s, sprop in metasources.items():
            objloc = op.join(agg_base_path,
                             _get_obj_location(objid, s, sprop['dumper']))
            smetafound = [
                # important to test for lexists() as we do not need to
                # or want to `get()` metadata files for this test.
                # Info on identity is NOT sufficient - later compare content if
                # multiple found
                objloc if op.lexists(op.join(d.path, objloc)) else None
                # Order of dss matters later
                for d in (aggfrom_ds, agginto_ds)
            ]
            if all(smetafound):
                # both have it
                metafound[s] = smetafound
                # but are they the same?
                try:
                    if _the_same_across_datasets(objloc, aggfrom_ds, agginto_ds):
                        uptodatemeta.append(s)
                except RuntimeError as exc:
                    # TODO: dedicated test - when meta content changes
                    lgr.debug("For now will just do re-extraction since caught %s",
                              exc_str(exc))
            # source one has it, so we might be able to copy it
            # TODO: dedicated test - when it is sufficient to copy we do not re-extract

    if len(metafound) != len(metasources):
        # some (either ds or cn) metadata is missing from the source and/or the
        # target dataset
        lgr.debug(
            "Incomplete or absent metadata while aggregating %s <- %s: %s",
            agginto_ds, aggfrom_ds, metafound
        )
        # no metadata found -> extract
        # this places metadata dump files into the configured
        # target dataset and lists them in `to_save`, as well
        # as updates the `db` record for `aggfrom_ds`
        return _extract_metadata(
            agginto_ds,
            aggfrom_ds,
            db,
            to_save,
            objid,
            metasources,
            refcommit,
            subds_relpaths,
            agg_base_path)

    # we did not actually run an extraction, so we need to
    # assemble an aggregation record from the existing pieces
    # that we found
    # simple case: the target dataset has all the records already and they are up to date:
    if len(uptodatemeta) == len(metasources):
        lgr.debug('Sticking with up-to-date metadata for %s', aggfrom_ds)
        # no change, use old record from the target dataset
        db[aggfrom_ds.path] = old_agginfo
        # no error
        return False
    else:
        lgr.debug('Reusing previously extracted metadata for %s', aggfrom_ds)
        # we need to move the metadata dump(s) into the target dataset
        objrelpaths = {
            label: next(filter(bool, smetafound))
            for label, smetafound in metafound.items()
        }
        # make sure all the to-be-moved metadata records are present
        # locally
        aggfrom_ds.get(
            # prep annotated path records to speed up the call
            path=[dict(path=op.join(aggfrom_ds.path, p),
                       parentds=aggfrom_ds.path,
                       type='file')
                  for p in objrelpaths.values()],
            result_renderer='disabled')

        # actually copy dump files
        for objrelpath in objrelpaths.values():
            objpath = op.join(agginto_ds.path, objrelpath)
            objdir = op.dirname(objpath)
            if not op.exists(objdir):
                makedirs(objdir)
            if op.lexists(objpath):
                os.unlink(objpath)  # remove previous version first
                # was a wild thought as a workaround for 
                # http://git-annex.branchable.com/bugs/cannot_commit___34__annex_add__34__ed_modified_file_which_switched_its_largefile_status_to_be_committed_to_git_now/#comment-bf70dd0071de1bfdae9fd4f736fd1ec1
                # agginto_ds.repo.remove(objpath)
            # XXX TODO once we have a command that can copy/move files
            # from one dataset to another including file availability
            # info, this should be used here
            shutil.copyfile(
                op.join(aggfrom_ds.path, objrelpath),
                objpath)
            # mark for saving
            to_save.append(dict(
                path=objpath,
                parentds=agginto_ds.path,
                type='file'))

        # lastly get 'self' aggregation record from source dataset and
        # use in target dataset
        db[aggfrom_ds.path] = load_ds_aggregate_db(aggfrom_ds, abspath=True)[aggfrom_ds.path]
        return False