Example #1
def _load_agginfo_db(ds_path):
    return {
        # paths in DB on disk are always relative
        # make absolute to ease processing during aggregation
        op.normpath(op.join(ds_path, p)): {
            k: op.normpath(op.join(ds_path, op.dirname(agginfo_relpath), v))
            if k in location_keys else v
            for k, v in props.items()
        }
        for p, props in _load_json_object(opj(ds_path,
                                              agginfo_relpath)).items()
    }
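For illustration, here is a minimal, self-contained sketch of the same normalization applied to an in-memory record; the `aggregate.json` location, the `location_keys` values, and all paths below are assumptions made up for this example, not taken from the module above.

import os.path as op

# hypothetical stand-ins for the module-level constants used above
agginfo_relpath = op.join('.datalad', 'metadata', 'aggregate.json')
location_keys = {'dataset_info', 'content_info'}

ds_path = '/data/study'
# what _load_json_object() might return: dataset keys and object
# locations are stored relative to the dataset / metadata directory
raw_db = {
    'sub-01': {
        'dataset_info': op.join('objects', '1a', '1a2b'),
        'id': 'some-uuid',
    },
}

abs_db = {
    op.normpath(op.join(ds_path, p)): {
        k: op.normpath(op.join(ds_path, op.dirname(agginfo_relpath), v))
        if k in location_keys else v
        for k, v in props.items()
    }
    for p, props in raw_db.items()
}
# keys become '/data/study/sub-01', and 'dataset_info' now points at
# '/data/study/.datalad/metadata/objects/1a/1a2b', while 'id' is untouched
print(abs_db)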
Example #2
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path
    of the dataset that contains the query path, plus any subdatasets
    in case of recursion (each with its own DB entry).

    Parameters
    ----------
    ds_path : str
      absolute path of the source dataset
    path : str
      absolute path for which to obtain metadata
    recursive : bool
      whether to also report datasets underneath `path`
    db : dict
      dictionary that receives the discovered info records, keyed by the
      absolute path of the containing dataset

    Returns
    -------
    tuple or list
      A tuple contains an error message template and its arguments; a list
      contains the absolute paths of all datasets for which info was put
      into the DB.
    """
    info_fpath = opj(ds_path, agginfo_relpath)
    info_basepath = dirname(info_fpath)
    # TODO cache these
    agginfos = _load_json_object(info_fpath)

    def _ensure_abs_obj_location(rec):
        # object location in the DB must be absolute so we can copy easily
        # to all relevant datasets
        for key in location_keys:
            if key in rec and not isabs(rec[key]):
                rec[key] = opj(info_basepath, rec[key])
        return rec

    rpath = relpath(path, start=ds_path)
    seed_ds = _get_containingds_from_agginfo(agginfos, rpath)
    if seed_ds is None:
        # nothing found
        # this (template, argument) tuple becomes the error message in the
        # result for the query path
        return ("No matching aggregated metadata in Dataset at %s", ds_path)

    # easy peasy
    seed_abs = opj(ds_path, seed_ds)
    db[seed_abs] = _ensure_abs_obj_location(agginfos[seed_ds])
    hits = [seed_abs]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if agginfo_path.startswith(_with_sep(seed_ds)):
            absp = opj(ds_path, agginfo_path)
            db[absp] = _ensure_abs_obj_location(agginfos[agginfo_path])
            hits.append(absp)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
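The recursive branch above reduces to a path-prefix match over the keys of the aggregate info dict. The following standalone sketch replays just that selection logic; `_with_sep()` is re-implemented here on the assumption that it merely appends a trailing separator, and all dataset paths are invented.

import os
import os.path as op

def _with_sep(path):
    # assumed behavior: add a trailing separator so 'sub-1' cannot match 'sub-10'
    return path if path.endswith(os.sep) else path + os.sep

ds_path = '/data/study'
agginfos = {
    os.curdir: {'id': 'root'},
    'sub-1': {'id': 'a'},
    op.join('sub-1', 'anat'): {'id': 'b'},
    'sub-10': {'id': 'c'},
}
seed_ds = 'sub-1'

hits = [op.join(ds_path, seed_ds)]
hits += [
    op.join(ds_path, rpath)
    for rpath in agginfos
    if rpath.startswith(_with_sep(seed_ds))
]
# -> ['/data/study/sub-1', '/data/study/sub-1/anat']; 'sub-10' is excluded
print(hits)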
Example #3
def _update_ds_agginfo(refds_path, ds_path, subds_paths, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset whose aggregate info is to be updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # location info of aggregate metadata
    # aggregate.json
    agginfo_fpath = opj(ds.path, agginfo_relpath)
    # base path in which aggregate.json and objects are located
    agg_base_path = dirname(agginfo_fpath)
    # load existing aggregate info dict
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    ds_agginfos = _load_json_object(agginfo_fpath)
    # object locations referenced initially
    objlocs_was = set(ai[k] for ai in ds_agginfos.values()
                      for k in location_keys if k in ai)
    # TODO look for datasets that are no longer registered and remove all
    # info about them

    # track which objects need to be copied
    objs2copy = []
    # for the dataset itself and each subdataset (any depth level)
    for dpath in [ds_path] + subds_paths:
        # relative path of the current dataset within the dataset we are updating
        drelpath = relpath(dpath, start=ds.path)
        # TODO figure out why `None` could be a value in the DB
        ## build aggregate info for the current subdataset
        #ds_dbinfo = agginfo_db.get(dpath, {})
        #if not ds_dbinfo:
        #    # we got nothing new, keep what we had
        #    continue
        #ds_dbinfo = ds_dbinfo.copy()
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        for loclabel in location_keys:
            if loclabel == 'filepath_info' and drelpath == curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objrelpath = opj(*objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where it is
            # needed in this dataset
            target_objpath = opj(agg_base_path, target_objrelpath)
            objs2copy.append((objloc, target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objrelpath
        # (re)assign in case record is new
        ds_agginfos[drelpath] = ds_dbinfo
    # set of metadata objects now referenced
    objlocs_is = set(ai[k] for ai in ds_agginfos.values()
                     for k in location_keys if k in ai)
    objs2remove = objlocs_was.difference(objlocs_is)
    # TODO do we need to (double?) check if all object files exist?
    #objs2add = [o for o in objlocs_is if exists(opj(ds_path, o))]
    objs2add = objlocs_is

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if objs2remove:
        ds.remove(
            [opj(agg_base_path, p) for p in objs2remove],
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            check=False,
            result_renderer=None,
            return_type=list)
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    # must copy object files to local target destination
    for copy_from, copy_to in objs2copy:
        if copy_to == copy_from:
            continue
        target_dir = dirname(copy_to)
        if not exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if exists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    # TODO is there any chance that this file could be gone at the end?
    #if exists(agginfo_fpath):
    to_save.append(dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added the standard way, depending on the repo type
        ds.add([opj(agg_base_path, p) for p in objs2add],
               save=False,
               result_renderer=None,
               return_type=list)
        # queue for save, and mark as staged
        to_save.extend([
            dict(path=opj(agg_base_path, p), type='file', staged=True)
            for p in objs2add
        ])
    # write aggregate info file
    if not ds_agginfos:
        return

    json_py.dump(ds_agginfos, agginfo_fpath)
    ds.add(agginfo_fpath,
           save=False,
           to_git=True,
           result_renderer=None,
           return_type=list)
    # queue for save, and mark as staged
    to_save.append(dict(path=agginfo_fpath, type='file', staged=True))
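Two small pieces carry most of the bookkeeping above: trimming an absolute object location down to its layout-relative objects/{hash}/{hash} tail, and diffing the previously referenced object set against the newly referenced one. A sketch with invented paths and hashes follows.

import os
import os.path as op

# an absolute object location as it might appear in the aggregate DB
objloc = op.join(os.sep, 'data', 'study', '.datalad', 'metadata',
                 'objects', '1a', '1a2bdeadbeef')
# keep only the last three components, i.e. objects/{hash}/{hash}
target_objrelpath = op.join(*objloc.split(os.sep)[-3:])
# -> 'objects/1a/1a2bdeadbeef' (relative to the metadata base path)

# object files referenced before vs. after the update
objlocs_was = {op.join('objects', '1a', 'old'), op.join('objects', '2b', 'kept')}
objlocs_is = {op.join('objects', '2b', 'kept'), op.join('objects', '3c', 'new')}
objs2remove = objlocs_was.difference(objlocs_is)  # only the stale object
objs2add = objlocs_is                             # everything now referenced
print(target_objrelpath, objs2remove, objs2add)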
Example #4
def _update_ds_agginfo(refds_path, ds_path, subds_paths, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset whose aggregate info is to be updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset.
      Any subdataset that is not listed here is assumed to be gone (i.e. no longer
      a subdataset at all, not just not locally installed)
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # location info of aggregate metadata
    # aggregate.json
    agginfo_fpath = opj(ds.path, agginfo_relpath)
    # base path in which aggregate.json and objects are located
    agg_base_path = dirname(agginfo_fpath)
    # load existing aggregate info dict
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    ds_agginfos = _load_json_object(agginfo_fpath)
    # object locations referenced initially
    objlocs_was = set(ai[k]
                      for ai in ds_agginfos.values()
                      for k in location_keys
                      if k in ai)
    # track which objects need to be copied (each item is a from/to tuple)
    objs2copy = []
    # for the dataset itself and each subdataset (any depth level)
    procds_paths = [ds_path] + subds_paths
    for dpath in procds_paths:
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        # relative path of the current dataset within the dataset we are updating
        drelpath = relpath(dpath, start=ds.path)
        for loclabel in location_keys:
            # TODO filepath_info is obsolete
            if loclabel == 'filepath_info' and drelpath == curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objrelpath = opj(*objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where it is
            # needed in this dataset
            target_objpath = opj(agg_base_path, target_objrelpath)
            objs2copy.append((objloc, target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objrelpath
        # (re)assign in case record is new
        ds_agginfos[drelpath] = ds_dbinfo
    # remove all entries for which we did not (no longer) have a corresponding
    # subdataset to take care of
    ds_agginfos = {k: v
                   for k, v in ds_agginfos.items()
                   if normpath(opj(ds_path, k)) in procds_paths}
    # set of metadata objects now referenced
    objlocs_is = set(
        ai[k]
        for sdsrpath, ai in ds_agginfos.items()
        for k in location_keys
        if k in ai)
    objs2add = objlocs_is

    # yoh: we apparently do need to filter the ones to remove - I did
    #      "git reset --hard HEAD^" and
    #      aggregate-metadata failed upon next run trying to remove
    #      a file unknown to git. I am yet to figure out why that
    #      mattered (hopefully not that reflog is used somehow)
    objs2remove = []
    for obj in objlocs_was.difference(objlocs_is):
        obj_path = opj(agg_base_path, obj)
        if lexists(obj_path):
            objs2remove.append(obj_path)
        else:
            # not really a warning, we don't need it anymore, it is already gone
            lgr.debug(
                "To-be-deleted metadata object not found, skip deletion (%s)",
                obj_path
            )

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if objs2remove:
        ds.remove(
            objs2remove,
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            check=False,
            result_renderer=None, return_type=list)
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    # must copy object files to local target destination
    # make sure those objects are present
    ds.get([f for f, t in objs2copy], result_renderer='disabled')
    for copy_from, copy_to in objs2copy:
        if copy_to == copy_from:
            continue
        target_dir = dirname(copy_to)
        if not exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if lexists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added the standard way, depending on the repo type
        ds.add(
            [opj(agg_base_path, p) for p in objs2add],
            save=False, result_renderer=None, return_type=list)
        # queue for save, and mark as staged
        to_save.extend(
            [dict(path=opj(agg_base_path, p), type='file', staged=True)
             for p in objs2add])
    # write aggregate info file
    if not ds_agginfos:
        return

    json_py.dump(ds_agginfos, agginfo_fpath)
    ds.add(agginfo_fpath, save=False, to_git=True,
           result_renderer=None, return_type=list)
    # queue for save, and mark as staged
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))
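Compared to the previous revision, this one prunes aggregate records for datasets that were not processed (and are therefore assumed to be gone) and only schedules deletions for object files that still exist on disk. A sketch of the pruning step alone, with made-up dataset paths, is shown below.

import os.path as op
from os.path import normpath

ds_path = '/data/study'
# aggregate records keyed by dataset path relative to ds_path
ds_agginfos = {
    '.': {'id': 'root'},
    'sub-1': {'id': 'a'},
    'gone-sub': {'id': 'b'},  # aggregated once, but no longer a subdataset
}
# datasets actually processed in this run (absolute paths)
procds_paths = ['/data/study', '/data/study/sub-1']

ds_agginfos = {
    k: v for k, v in ds_agginfos.items()
    if normpath(op.join(ds_path, k)) in procds_paths
}
# -> only '.' and 'sub-1' survive; the 'gone-sub' record is dropped
print(sorted(ds_agginfos))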