def test_get_metadata_type(path):
    Dataset(path).create()
    # nothing set, nothing found
    assert_equal(get_metadata_type(Dataset(path)), [])
    # got section, but no setting
    open(opj(path, '.datalad', 'config'), 'w').write('[datalad "metadata"]\n')
    assert_equal(get_metadata_type(Dataset(path)), [])
    # minimal setting
    open(opj(path, '.datalad', 'config'), 'w+').write(
        '[datalad "metadata"]\nnativetype = mamboschwambo\n')
    assert_equal(get_metadata_type(Dataset(path)), 'mamboschwambo')
def test_get_metadata_type(path=None):
    ds = Dataset(path).create()
    # nothing set, nothing found
    assert_equal(get_metadata_type(ds), [])
    # got section, but no setting
    open(opj(path, '.datalad', 'config'), 'w').write('[datalad "metadata"]\n')
    # not relying on automagical pick up of changes done
    # by external powers to the config
    # see https://github.com/datalad/datalad/issues/4363 for more info
    ds.config.reload()
    assert_equal(get_metadata_type(ds), [])
    # minimal setting
    open(opj(path, '.datalad', 'config'), 'w+').write(
        '[datalad "metadata"]\nnativetype = mamboschwambo\n')
    ds.config.reload()
    assert_equal(get_metadata_type(ds), 'mamboschwambo')
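# A minimal sketch of the helper exercised by the tests above, assuming it only
# reads the 'datalad.metadata.nativetype' configuration item that the tests
# write; the actual datalad implementation may normalize multiple configured
# types and other edge cases differently. The function name is hypothetical.
def _get_metadata_type_sketch(ds):
    # a missing key (or a bare '[datalad "metadata"]' section) means
    # "no native metadata type configured"
    value = ds.config.get('datalad.metadata.nativetype', None)
    return [] if value is None else value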
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid,
                      metasources, refcommit, subds_relpaths, agg_base_path):
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # we will replace any conflicting info on this dataset with fresh stuff
    agginfo = db.get(aggfrom_ds.path, {})
    # paths to extract from
    relevant_paths = sorted(_get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # get extractors to engage from source dataset
    nativetypes = ['datalad_core', 'annex'] + ensure_list(get_metadata_type(aggfrom_ds))
    # store essential extraction config in dataset record
    agginfo['extractors'] = nativetypes
    agginfo['datalad_version'] = datalad.__version__
    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        nativetypes,
        # None indicates to honor a dataset's per-extractor configuration
        # and to be on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)
    meta = {
        'ds': dsmeta,
        'cn': (dict(contentmeta[k], path=k) for k in sorted(contentmeta)),
    }
    # inject the info which commit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    coremeta = dsmeta.get('datalad_core', {})
    version = aggfrom_ds.repo.describe(commitish=refcommit)
    if version:
        coremeta['version'] = version
    coremeta['refcommit'] = refcommit
    dsmeta['datalad_core'] = coremeta

    # for both types of metadata
    for label, props in metasources.items():
        dest = props['targetds']
        if not meta[label]:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label, props['dumper'])
        # place metadata object into the source dataset
        objpath = op.join(dest.path, agg_base_path, objrelpath)
        # write obj files
        if op.exists(objpath):
            dest.unlock(objpath)
        elif op.lexists(objpath):
            # if it gets here, we have a symlink that is pointing nowhere
            # kill it, to be replaced with the newly aggregated content
            dest.repo.remove(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        props['dumper'](meta[label], objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))
        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(props['type'])] = objpath
    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = agginfo
    return errored
def _extract_metadata(agginto_ds, aggfrom_ds, db, merge_native, to_save):
    """Dump metadata from a dataset into objects in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    Parameters
    ----------
    agginto_ds : Dataset
    aggfrom_ds : Dataset
    db : dict
    merge_native : str
      Merge mode.
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths', return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2. our own dataset-global metadata
    dsmetafile = opj(aggfrom_ds.path, '.datalad', 'metadata', 'dataset.json')
    if exists(dsmetafile):
        objid += md5(open(dsmetafile, 'r').read().encode()).hexdigest()
    # 3. potential annex-based metadata
    if isinstance(aggfrom_ds.repo, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps, _ = aggfrom_ds.repo._run_annex_command(
            'metadata', '.', '-g', 'lastchanged')
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug('Dump metadata of %s (merge mode: %s) into %s',
                  aggfrom_ds, merge_native, agginto_ds)

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        dsmeta = contentmeta = None
        # this is no error, there is simply no metadata whatsoever
        return False

    # if there is any chance for metadata
    # obtain metadata for dataset and content
    relevant_paths = sorted(
        _get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    nativetypes = get_metadata_type(aggfrom_ds)
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        # core must come first
        ['datalad_core'] + assure_list(nativetypes),
        merge_native,
        # None indicates to honor a dataset's per-parser configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    metasources = [('ds', 'dataset', dsmeta, aggfrom_ds, json_py.dump)]
    # do not store content metadata if either the source or the target dataset
    # do not want it
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) or \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
        metasources.append((
            'cn',
            'content',
            # sort by path key to get deterministic dump content
            (dict(contentmeta[k], path=k) for k in sorted(contentmeta)),
            aggfrom_ds,
            json_py.dump2xzstream))

    # for both types of metadata
    for label, mtype, meta, dest, store in metasources:
        if not meta:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label)
        if store is json_py.dump2xzstream:
            objrelpath += '.xz'
        # place metadata object into the source dataset
        objpath = opj(dest.path, dirname(agginfo_relpath), objrelpath)
        # write obj files
        if exists(objpath):
            dest.unlock(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        store(meta, objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))
        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(mtype)] = objpath
    return errored
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid,
                      metasources, refcommit, subds_relpaths, agg_base_path):
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # we will replace any conflicting info on this dataset with fresh stuff
    agginfo = db.get(aggfrom_ds.path, {})
    # paths to extract from
    relevant_paths = sorted(_get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # get extractors to engage from source dataset
    nativetypes = ['datalad_core', 'annex'] + assure_list(get_metadata_type(aggfrom_ds))
    # store essential extraction config in dataset record
    agginfo['extractors'] = nativetypes
    agginfo['datalad_version'] = datalad.__version__
    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        nativetypes,
        # None indicates to honor a dataset's per-extractor configuration
        # and to be on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)
    meta = {
        'ds': dsmeta,
        'cn': (dict(contentmeta[k], path=k) for k in sorted(contentmeta)),
    }
    # inject the info which commit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    coremeta = dsmeta.get('datalad_core', {})
    version = aggfrom_ds.repo.describe(commitish=refcommit)
    if version:
        coremeta['version'] = version
    coremeta['refcommit'] = refcommit
    dsmeta['datalad_core'] = coremeta

    # for both types of metadata
    for label, props in metasources.items():
        dest = props['targetds']
        if not meta[label]:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label, props['dumper'])
        # place metadata object into the source dataset
        objpath = op.join(dest.path, agg_base_path, objrelpath)
        # write obj files
        if op.exists(objpath):
            dest.unlock(objpath)
        elif op.lexists(objpath):
            # if it gets here, we have a symlink that is pointing nowhere
            # kill it, to be replaced with the newly aggregated content
            dest.repo.remove(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        props['dumper'](meta[label], objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))
        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(props['type'])] = objpath
    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = agginfo
    return errored
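# Illustrative (hypothetical) shape of the `metasources` mapping consumed by the
# loop above, inferred only from the keys accessed on each `props` entry
# ('targetds', 'type', 'dumper'); which dataset the caller passes as target and
# which dumper helpers it picks is not shown in this excerpt. `some_dataset` is
# a placeholder, and the dumpers shown are the ones used by the older variant.
metasources_sketch = {
    'ds': dict(type='dataset', targetds=some_dataset, dumper=json_py.dump),
    'cn': dict(type='content', targetds=some_dataset, dumper=json_py.dump2xzstream),
}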