def query_aggregated_metadata(reporton, ds, aps, recursive=False, **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion by the
    caller of this function, i.e. it must have been decided beforehand which
    dataset to query for any given path.

    Also, this function does not cache anything; the caller must therefore
    make sure to call it only once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    info_fpath = opj(ds.path, agginfo_relpath)
    agg_base_path = dirname(info_fpath)
    agginfos = _load_json_object(info_fpath)

    # cache once-loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path: the latter reflects the situation on disk, whereas we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = curdir
        rap['metaprovider'] = containing_ds

        # build the list of datasets and paths to be queried for this
        # annotated path; in the simple case this is just the containing
        # dataset and the actual query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset
            # underneath the query path
            matching_subds = [
                {'metaprovider': sub, 'rpath': sub, 'type': 'dataset'}
                for sub in sorted(agginfos)
                # we already have the base dataset
                if (rpath == curdir and sub != curdir) or
                path_is_subpath(sub, rpath)]
            to_query.extend(matching_subds)

        # one heck of a beast to get the set of filenames for all metadata
        # objects that are required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query
            for t in ('dataset_info',) +
            (('content_info',)
             if ((reporton is None and qap.get('type', None) == 'file') or
                 reporton in ('files', 'all'))
             else tuple())
        )
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        get(path=[dict(path=opj(agg_base_path, of),
                       parentds=ds.path, type='file')
                  for of in objfiles if of],
            dataset=ds,
            result_renderer='disabled')
        for qap in to_query:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(
                    opj(agg_base_path, dsobjloc),
                    cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton,
                    cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this
                # result is being reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = normpath(opj(ds.path, qap['metaprovider']))
                yield r
            reported.add(qap['rpath'])
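

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how a caller might assemble annotated paths and
# consume the generator above. The helper name `_example_consume_query`,
# the single dataset-typed query path, and the assumption that successful
# records carry a 'status' of 'ok' are hypothetical; real callers (e.g. the
# `metadata`/`search` commands) build `aps` via their own path annotation
# machinery.
def _example_consume_query(ds):
    # query the dataset itself, and everything underneath it
    aps = [dict(path=ds.path, type='dataset')]
    return [
        res for res in query_aggregated_metadata(
            reporton='all', ds=ds, aps=aps, recursive=True)
        # keep only records that do not report a problem
        if res.get('status', 'ok') == 'ok'
    ]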
def query_aggregated_metadata(reporton, ds, aps, recursive=False, **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion by the
    caller of this function, i.e. it must have been decided beforehand which
    dataset to query for any given path.

    Also, this function does not cache anything; the caller must therefore
    make sure to call it only once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    info_fpath = opj(ds.path, agginfo_relpath)
    agg_base_path = dirname(info_fpath)
    agginfos = _load_json_object(info_fpath)
    if not agginfos and not exists(info_fpath):
        # This dataset does not have aggregated metadata. Does it have any
        # other version?
        info_glob = agginfo_relpath_template.format('*')
        info_files = glob.glob(info_glob)
        msg = "Found no aggregated metadata info file %s." \
              % info_fpath
        old_metadata_file = opj(ds.path, METADATA_DIR, METADATA_FILENAME)
        if exists(old_metadata_file):
            msg += " Found metadata generated with pre-0.10 version of " \
                   "DataLad, but it will not be used."
        upgrade_msg = ""
        if info_files:
            msg += " Found the following info files, which might have been " \
                   "generated with newer version(s) of datalad: %s." \
                   % (', '.join(info_files))
            upgrade_msg = ", upgrade datalad"
        msg += " You will likely need to either update the dataset from its " \
               "original location,%s or reaggregate metadata locally." \
               % upgrade_msg
        lgr.warning(msg)

    # cache once-loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path: the latter reflects the situation on disk, whereas we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = curdir
        rap['metaprovider'] = containing_ds

        # build the list of datasets and paths to be queried for this
        # annotated path; in the simple case this is just the containing
        # dataset and the actual query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset
            # underneath the query path
            matching_subds = [
                {'metaprovider': sub, 'rpath': sub, 'type': 'dataset'}
                for sub in sorted(agginfos)
                # we already have the base dataset
                if (rpath == curdir and sub != curdir) or
                path_is_subpath(sub, rpath)]
            to_query.extend(matching_subds)

        # skip query paths for which this dataset has no aggregated metadata,
        # and report them as 'impossible' instead
        to_query_available = []
        for qap in to_query:
            if qap['metaprovider'] not in agginfos:
                res = get_status_dict(
                    status='impossible',
                    path=qap['path'],
                    message=(
                        'Dataset at %s contains no aggregated metadata on '
                        'this path', qap['metaprovider']),
                )
                res.update(res, **kwargs)
                if 'type' in qap:
                    res['type'] = qap['type']
                yield res
            else:
                to_query_available.append(qap)

        # one heck of a beast to get the set of filenames for all metadata
        # objects that are required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query_available
            for t in ('dataset_info',) +
            (('content_info',)
             if ((reporton is None and qap.get('type', None) == 'file') or
                 reporton in ('files', 'all'))
             else tuple())
        )
        # in case there was no metadata provider, we do not want to start
        # downloading everything:
        # see https://github.com/datalad/datalad/issues/2458
        objfiles.difference_update([None])
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            get(path=[dict(path=opj(agg_base_path, of),
                           parentds=ds.path, type='file')
                      for of in objfiles if of],
                dataset=ds,
                result_renderer='disabled')
        for qap in to_query_available:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(
                    opj(agg_base_path, dsobjloc),
                    cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton,
                    cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this
                # result is being reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = normpath(opj(ds.path, qap['metaprovider']))
                yield r
            reported.add(qap['rpath'])
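

# --- Illustrative sketch (not part of the original module) ---
# This revision starts yielding 'impossible' results for query paths whose
# metadata provider is absent from the aggregated-metadata database. A
# caller could separate those from regular records as below; the helper
# name `_example_split_missing` and the exact keys inspected ('status',
# 'path') are assumptions made for demonstration only.
def _example_split_missing(ds, aps):
    hits, missing = [], []
    for res in query_aggregated_metadata(
            reporton=None, ds=ds, aps=aps, recursive=False):
        if res.get('status', None) == 'impossible':
            # no aggregated metadata was available for this query path
            missing.append(res['path'])
        else:
            hits.append(res)
    return hits, missing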
def query_aggregated_metadata(reporton, ds, aps, recursive=False, **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion by the
    caller of this function, i.e. it must have been decided beforehand which
    dataset to query for any given path.

    Also, this function does not cache anything; the caller must therefore
    make sure to call it only once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    agginfos, agg_base_path = load_ds_aggregate_db(ds)

    # cache once-loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = op.relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path: the latter reflects the situation on disk, whereas we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = op.curdir
        rap['metaprovider'] = containing_ds

        # build the list of datasets and paths to be queried for this
        # annotated path; in the simple case this is just the containing
        # dataset and the actual query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset
            # underneath the query path
            matching_subds = [
                {'metaprovider': sub, 'rpath': sub, 'type': 'dataset'}
                for sub in sorted(agginfos)
                # we already have the base dataset
                if (rpath == op.curdir and sub != op.curdir) or
                path_is_subpath(sub, rpath)]
            to_query.extend(matching_subds)

        # skip query paths for which this dataset has no aggregated metadata,
        # and report them as 'impossible' instead
        to_query_available = []
        for qap in to_query:
            if qap['metaprovider'] not in agginfos:
                res = get_status_dict(
                    status='impossible',
                    path=qap['path'],
                    message=(
                        'Dataset at %s contains no aggregated metadata on '
                        'this path', qap['metaprovider']),
                )
                res.update(res, **kwargs)
                if 'type' in qap:
                    res['type'] = qap['type']
                yield res
            else:
                to_query_available.append(qap)

        # one heck of a beast to get the set of filenames for all metadata
        # objects that are required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query_available
            for t in ('dataset_info',) +
            (('content_info',)
             if ((reporton is None and qap.get('type', None) == 'file') or
                 reporton in ('files', 'all'))
             else tuple())
        )
        # in case there was no metadata provider, we do not want to start
        # downloading everything:
        # see https://github.com/datalad/datalad/issues/2458
        objfiles.difference_update([None])
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            get(path=[dict(path=op.join(agg_base_path, of),
                           parentds=ds.path, type='file')
                      for of in objfiles if of],
                dataset=ds,
                result_renderer='disabled')
        for qap in to_query_available:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(
                    op.join(agg_base_path, dsobjloc),
                    cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton,
                    cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this
                # result is being reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = op.normpath(
                        op.join(ds.path, qap['metaprovider']))
                yield r
            reported.add(qap['rpath'])
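

# --- Illustrative sketch (not part of the original module) ---
# File-type results produced above carry a 'parentds' key pointing at the
# subdataset whose aggregated record provided their metadata. A downstream
# consumer could group file results by providing subdataset as shown here;
# the helper name `_example_group_by_provider` and the assumption that each
# file result exposes 'path', 'type', and 'parentds' keys are illustrative
# only.
def _example_group_by_provider(ds, aps):
    from collections import defaultdict
    by_provider = defaultdict(list)
    for res in query_aggregated_metadata(
            reporton='files', ds=ds, aps=aps, recursive=True):
        if res.get('type', None) == 'file':
            # fall back to the queried dataset if no parentds was recorded
            by_provider[res.get('parentds', ds.path)].append(res['path'])
    return dict(by_provider)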