def test_aggregate_with_unavailable_objects_from_subds(path, target):
    base = Dataset(op.join(path, 'origin')).create(force=True)
    # force all metadata objects into the annex
    with open(op.join(base.path, '.datalad', '.gitattributes'), 'w') as f:
        f.write(
            '** annex.largefiles=nothing\nmetadata/objects/** annex.largefiles=anything\n'
        )
    sub = base.create('sub', force=True)
    subsub = base.create(op.join('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    base.meta_aggregate(recursive=True, into='all')
    assert_repo_status(base.path)
    # now make that a subdataset of a new one, so aggregation needs to get the
    # metadata objects first:
    super = Dataset(target).create()
    super.install("base", source=base.path)
    assert_repo_status(super.path)
    clone = Dataset(op.join(super.path, "base"))
    assert_repo_status(clone.path)
    objpath = PurePosixPath('.datalad/metadata/objects')
    objs = [
        o for o in sorted(clone.repo.get_annexed_files(with_content_only=False))
        if objpath in PurePosixPath(o).parents
    ]
    eq_(len(objs), 6)
    eq_(all(clone.repo.file_has_content(objs)), False)
    # now aggregate should get those metadata objects
    super.meta_aggregate(recursive=True, into='all')
    eq_(all(clone.repo.file_has_content(objs)), True)
def _yield_dsmeta(ds):
    srcfiles, cfg_srcfiles = _get_dsmeta_srcfiles(ds)
    dsmeta = {}
    for srcfile in srcfiles:
        abssrcfile = ds.pathobj / PurePosixPath(srcfile)
        # TODO get annexed files, or do in a central place?
        if not abssrcfile.exists():
            # nothing to load
            # warn if this was configured
            if srcfile in cfg_srcfiles:
                yield dict(
                    path=ds.path,
                    type='dataset',
                    status='impossible',
                    message=(
                        'configured custom metadata source is not '
                        'available in %s: %s',
                        ds, srcfile),
                )
            # no further operation on half-broken metadata
            return
        lgr.debug('Load custom metadata from %s', abssrcfile)
        meta = jsonload(text_type(abssrcfile))
        dsmeta.update(meta)
    if dsmeta:
        yield dict(
            path=ds.path,
            metadata=dsmeta,
            type='dataset',
            status='ok',
        )
def get_refcommit(ds):
    """Get most recent commit that changes any metadata-relevant content.

    This function should be executed in a clean dataset, with no uncommitted
    changes (untracked is OK).

    Returns
    -------
    str or None
      None if there is no matching commit, a hexsha otherwise.
    """
    exclude_paths = [
        ds.repo.pathobj / PurePosixPath(e)
        for e in exclude_from_metadata
    ]
    count = 0
    diff_cache = {}
    precommit = False
    while True:
        cur = 'HEAD~{:d}'.format(count)
        try:
            # get the diff between the next pair of previous commits
            diff = {
                p.relative_to(ds.repo.pathobj): props
                for p, props in iteritems(ds.repo.diffstatus(
                    PRE_INIT_COMMIT_SHA
                    if precommit
                    else 'HEAD~{:d}'.format(count + 1),
                    cur,
                    # superfluous, but here to state the obvious
                    untracked='no',
                    # this should be OK, unit test covers the cases
                    # of subdataset addition, modification and removal
                    # refcommit evaluation only makes sense in a clean
                    # dataset, and if that is true, any change in the
                    # submodule record will be visible in the parent
                    # already
                    eval_submodule_state='no',
                    # boost performance, we don't care about file types
                    # here
                    eval_file_type=False,
                    _cache=diff_cache))
                if props.get('state', None) != 'clean' \
                and p not in exclude_paths \
                and not any(e in p.parents for e in exclude_paths)
            }
        except ValueError as e:
            # likely ran out of commits to check
            if precommit:
                # end of things
                return None
            else:
                # one last round, taking in the entire history
                precommit = True
                continue
        if diff:
            return ds.repo.get_hexsha(cur)
        # next pair
        count += 1
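
# A minimal usage sketch (not part of the original module): assumes a DataLad
# Dataset instance `ds` with at least one commit and a clean working tree, as
# `get_refcommit` expects. The helper name `_demo_refcommit` is hypothetical.
def _demo_refcommit(ds):
    refcommit = get_refcommit(ds)
    if refcommit is None:
        # nothing metadata-relevant was ever committed
        return None
    # the returned hexsha is what metadata extractors receive as `refcommit`
    return refcommit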
def _get_fmeta_objpath(ds, expr, rec):
    fpath = Path(rec['path'])
    if rec.get('type', None) != 'file':  # pragma: no cover
        # nothing else in here
        return
    # build associated metadata file path from POSIX
    # pieces and convert to platform conventions at the end
    return text_type(
        ds.pathobj / PurePosixPath(
            expr.format(
                freldir=fpath.relative_to(ds.pathobj).parent.as_posix(),
                fname=fpath.name)))
def annexjson2result(d, ds, **kwargs):
    """Helper to convert an annex JSON result to a datalad result dict

    Info from annex is rather heterogeneous, partly because some of it
    is faked by our support functions. This helper should be extended
    with all needed special cases to homogenize the information.

    Parameters
    ----------
    d : dict
      Annex info dict.
    ds : Dataset instance
      Used to determine absolute paths for `file` results. This dataset
      is not used to set `refds` in the result, pass this as a separate
      kwarg if needed.
    **kwargs
      Passed as-is to `get_status_dict`. Must not contain `refds`.
    """
    lgr.debug('received JSON result from annex: %s', d)
    messages = []
    res = get_status_dict(**kwargs)
    res['status'] = 'ok' if d.get('success', False) is True else 'error'
    # we cannot rely on any of these to be available as the feed from
    # git annex (or its wrapper) is not always homogeneous
    if d.get('file'):
        res['path'] = str(ds.pathobj / PurePosixPath(d['file']))
    if 'command' in d:
        res['action'] = d['command']
    if 'key' in d:
        res['annexkey'] = d['key']
    if 'fields' in d:
        # this is annex metadata, filter out timestamps
        res['metadata'] = {
            k: v[0] if isinstance(v, list) and len(v) == 1 else v
            for k, v in d['fields'].items()
            if not k.endswith('lastchanged')
        }
    if d.get('error-messages', None):
        res['error_message'] = '\n'.join(m.strip() for m in d['error-messages'])
    # avoid meaningless standard messages, and collision with actual error
    # messages
    elif 'note' in d:
        note = "; ".join(
            ln for ln in d['note'].splitlines()
            if ln != 'checksum...' and not ln.startswith('checking file'))
        if note:
            messages.append(translate_annex_notes.get(note, note))
    if messages:
        res['message'] = '\n'.join(m.strip() for m in messages)
    return res
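
# Illustrative sketch (hypothetical, not from the source): feeds a hand-made
# dict mimicking a typical `git annex --json` record through the converter.
# Assumes `ds` is a Dataset instance; the file name and key below are made up.
def _demo_annexjson2result(ds):
    sample = {
        'command': 'add',
        'success': True,
        'file': 'data/table.csv',
        'key': 'MD5E-s3--0123456789abcdef0123456789abcdef.csv',
    }
    res = annexjson2result(sample, ds, action='add')
    # expected: res['status'] == 'ok', res['path'] is absolute,
    # res['annexkey'] carries the key reported by annex
    return res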
def _parse_gitmodules(ds):
    # TODO read .gitconfig from Git blob?
    gitmodules = ds.pathobj / '.gitmodules'
    if not gitmodules.exists():
        return {}
    # pull out file content
    out, err = ds.repo._git_custom_command(
        '',
        ['git', 'config', '-z', '-l', '--file', '.gitmodules'])
    # abuse our config parser
    db, _ = _parse_gitconfig_dump(out, {}, None, True)
    mods = {}
    for k, v in iteritems(db):
        if not k.startswith('submodule.'):
            # we don't know what this is
            lgr.debug("Skip unrecognized .gitmodule specification: %s=%s", k, v)
            continue
        k_l = k.split('.')
        # module name is everything after 'submodule.' that is not the variable
        # name
        mod_name = '.'.join(k_l[1:-1])
        mod = mods.get(mod_name, {})
        # variable name is the last 'dot-free' segment in the key
        mod[k_l[-1]] = v
        mods[mod_name] = mod

    out = {}
    # bring into traditional shape
    for name, props in iteritems(mods):
        if 'path' not in props:
            lgr.debug("Failed to get '%s.path', skipping section", name)
            continue
        modprops = {
            'gitmodule_{}'.format(k): v
            for k, v in iteritems(props)
            if not (k.startswith('__') or k == 'path')
        }
        modpath = ds.pathobj / PurePosixPath(props['path'])
        modprops['gitmodule_name'] = name
        out[modpath] = modprops
    return out
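
# Output-shape sketch (hypothetical helper, not part of the source): for each
# submodule section in .gitmodules, `_parse_gitmodules` maps the absolute
# submodule path to its properties, with keys prefixed by 'gitmodule_' and the
# 'path' value folded into the dict key. Assumes an installed Dataset `ds`.
def _demo_list_submodules(ds):
    for modpath, props in _parse_gitmodules(ds).items():
        # e.g. gitmodule_name, gitmodule_url, and any other recorded variables
        print(modpath, props.get('gitmodule_name'), props.get('gitmodule_url'))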
def __call__(self, dataset, refcommit, process_type, status):
    # shortcut
    ds = dataset
    repo = ds.repo  # OPT: .repo could be relatively expensive
    if not isinstance(repo, AnnexRepo):
        # nothing to be done
        return
    if process_type not in ('all', 'content'):
        return
    # no progress bar, we are only making a one-shot call to
    # annex, the rest is pretty much instantaneous

    # limit query to paths that are annexed
    query_paths = [
        # go relative to minimize cmdline footprint of annex call
        text_type(Path(s['path']).relative_to(ds.pathobj))
        for s in status
        # anything that looks like an annexed file
        if s.get('type', None) == 'file' \
        and s.get('key', None) is not None
    ]
    log_progress(
        lgr.info,
        'extractorannex',
        'Start annex metadata extraction from %s', ds,
        total=len(query_paths),
        label='Annex metadata extraction',
        unit=' Files',
    )
    for fpath, meta in repo.get_metadata(
            query_paths,
            # no timestamps, we are describing the status quo
            timestamps=False,
            # because we have filtered the query to only contain
            # annexed files, we can use batch mode and deal with
            # many files
            batch=True):
        log_progress(
            lgr.info,
            'extractorannex',
            'Extracted annex metadata from %s', fpath,
            update=1,
            increment=True)
        meta = {
            k: v[0] if isinstance(v, list) and len(v) == 1 else v
            for k, v in meta.items()
        }
        if not meta:
            # only talk about files that actually carry metadata
            continue
        yield dict(
            # git annex reports the path in POSIX conventions
            path=PurePosixPath(fpath),
            metadata=meta,
            type='file',
            status='ok',
        )
    log_progress(
        lgr.info,
        'extractorannex',
        'Finished annex metadata extraction from %s', ds,
    )
def postclonecfg_ria(ds, props):
    """Configure a dataset freshly cloned from a RIA store"""
    repo = ds.repo
    # RIA uses hashdir mixed, copying data to it via git-annex (if cloned via
    # ssh) would make it see a bare repo and establish a hashdir lower annex
    # object tree.
    # Moreover, we want the ORA remote to receive all data for the store, so
    # its objects could be moved into archives (the main point of a RIA store).
    RIA_REMOTE_NAME = 'origin'  # don't hardcode everywhere
    ds.config.set(
        'remote.{}.annex-ignore'.format(RIA_REMOTE_NAME), 'true',
        where='local')

    # chances are that if this dataset came from a RIA store, its subdatasets
    # may live there too. Place a subdataset source candidate config that makes
    # get probe this RIA store when obtaining subdatasets
    ds.config.set(
        # we use the label 'origin' for this candidate in order to not have to
        # generate a complicated name from the actual source specification.
        # we pick a cost of 200 to sort it before datalad's default candidates
        # for non-RIA URLs, because they prioritize hierarchical layouts that
        # cannot be found in a RIA store
        'datalad.get.subdataset-source-candidate-200origin',
        # use the entire original URL, up to the fragment, plus the dataset ID
        # placeholder, this should make things work with any store setup we
        # support (paths, ports, ...)
        props['source'].split('#', maxsplit=1)[0] + '#{id}',
        where='local')

    # setup publication dependency, if a corresponding special remote exists
    # and was enabled (there could be RIA stores that actually only have repos)
    # make this function be a generator
    ora_remotes = [s for s in ds.siblings('query', result_renderer='disabled')
                   if s.get('annex-externaltype') == 'ora']
    if not ora_remotes and any(
            r.get('externaltype') == 'ora'
            for r in (repo.get_special_remotes().values()
                      if hasattr(repo, 'get_special_remotes')
                      else [])):
        # no ORA remote autoenabled, but configuration known about at least
        # one. Let's check origin's config for datalad.ora-remote.uuid as
        # stored by create-sibling-ria and try enabling that one.
        lgr.debug("Found no autoenabled ORA special remote. Trying to look it "
                  "up in source config ...")

        # First figure whether we cloned via SSH, HTTP or local path and then
        # get that config file the same way:
        config_content = None
        scheme = props['giturl'].split(':', 1)[0]
        if scheme in ['http', 'https']:
            try:
                config_content = download_url(
                    "{}{}config".format(
                        props['giturl'],
                        '/' if not props['giturl'].endswith('/') else ''))
            except DownloadError as e:
                lgr.debug("Failed to get config file from source:\n%s",
                          exc_str(e))
        elif scheme == 'ssh':
            # TODO: switch the following to proper command abstraction:
            # SSHRemoteIO ignores the path part ATM. No remote CWD! (To be
            # changed with command abstractions). So we need to get that part
            # to have a valid path to origin's config file:
            cfg_path = PurePosixPath(URL(props['giturl']).path) / 'config'
            op = SSHRemoteIO(props['giturl'])
            try:
                config_content = op.read_file(cfg_path)
            except RIARemoteError as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        elif scheme == 'file':
            # TODO: switch the following to proper command abstraction:
            op = LocalIO()
            cfg_path = Path(URL(props['giturl']).localpath) / 'config'
            try:
                config_content = op.read_file(cfg_path)
            except (RIARemoteError, OSError) as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        else:
            lgr.debug("Unknown URL-Scheme %s in %s. Can handle SSH, HTTP or "
                      "FILE scheme URLs.", scheme, props['source'])

        # And read it
        org_uuid = None
        if config_content:
            # TODO: We might be able to spare the saving to a file.
            #       "git config -f -" is not explicitly documented but happens
            #       to work and would read from stdin. Make sure we know this
            #       works for required git versions and on all platforms.
            with make_tempfile(content=config_content) as cfg_file:
                runner = GitWitlessRunner()
                try:
                    result = runner.run(
                        ['git', 'config', '-f', cfg_file,
                         'datalad.ora-remote.uuid'],
                        protocol=StdOutCapture
                    )
                    org_uuid = result['stdout'].strip()
                except CommandError as e:
                    # doesn't contain what we are looking for
                    lgr.debug("Found no UUID for ORA special remote at "
                              "'%s' (%s)", RIA_REMOTE_NAME, exc_str(e))

        # Now, enable it. If annex-init didn't fail to enable it as stored, we
        # wouldn't end up here, so enable with store URL as suggested by the
        # URL we cloned from.
        if org_uuid:
            srs = repo.get_special_remotes()
            if org_uuid in srs.keys():
                # TODO: - Double-check autoenable value and only do this when
                #         true?
                #       - What if still fails? -> Annex shouldn't change config
                #         in that case

                # we only need the store:
                new_url = props['source'].split('#')[0]
                try:
                    repo.enable_remote(srs[org_uuid]['name'],
                                       options=['url={}'.format(new_url)])
                    lgr.info("Reconfigured %s for %s",
                             srs[org_uuid]['name'], new_url)
                    # update ora_remotes for considering publication dependency
                    # below
                    ora_remotes = [
                        s for s in ds.siblings('query',
                                               result_renderer='disabled')
                        if s.get('annex-externaltype', None) == 'ora']
                except CommandError as e:
                    lgr.debug("Failed to reconfigure ORA special remote: %s",
                              exc_str(e))
            else:
                lgr.debug("Unknown ORA special remote uuid at '%s': %s",
                          RIA_REMOTE_NAME, org_uuid)

    if ora_remotes:
        if len(ora_remotes) == 1:
            yield from ds.siblings('configure',
                                   name=RIA_REMOTE_NAME,
                                   publish_depends=ora_remotes[0]['name'],
                                   result_filter=None,
                                   result_renderer='disabled')
        else:
            lgr.warning("Found multiple ORA remotes. Couldn't decide which "
                        "publishing to 'origin' should depend on: %s. Consider "
                        "running 'datalad siblings configure -s origin "
                        "--publish-depends ORAREMOTENAME' to set publication "
                        "dependency manually.",
                        [r['name'] for r in ora_remotes])
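
# Inspection sketch (hypothetical helper, not part of the source): after the
# configuration above ran on a freshly cloned dataset `ds`, its local effect
# can be checked by querying the very config keys set by postclonecfg_ria().
def _demo_inspect_ria_clone_cfg(ds):
    return {
        'annex-ignore': ds.config.get('remote.origin.annex-ignore'),
        # original clone URL up to '#', plus the '{id}' placeholder
        'subds-candidate': ds.config.get(
            'datalad.get.subdataset-source-candidate-200origin'),
    }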
def _get_contained_objs(ds):
    root = ds.pathobj / '.datalad' / 'metadata' / 'objects'
    return set(
        f for f in ds.repo.get_indexed_files()
        if root in (ds.pathobj / PurePosixPath(f)).parents
    )
def test_url_samples():
    _check_ri("http://example.com", URL, scheme='http', hostname="example.com")
    # "complete" one for classical http
    _check_ri("http://*****:*****@example.com:8080/p/sp?p1=v1&p2=v2#frag",
              URL, scheme='http', hostname="example.com", port=8080,
              username='******', password='******', path='/p/sp',
              query='p1=v1&p2=v2', fragment='frag')
    # sample one for ssh with specifying the scheme
    # XXX? might be useful? https://github.com/FriendCode/giturlparse.py
    _check_ri("ssh://host/path/sp1", URL, scheme='ssh', hostname='host',
              path='/path/sp1')
    _check_ri("user@host:path/sp1", SSHRI, hostname='host', path='path/sp1',
              username='******')
    _check_ri("host:path/sp1", SSHRI, hostname='host', path='path/sp1')
    _check_ri("host:path", SSHRI, hostname='host', path='path')
    _check_ri("host:/path", SSHRI, hostname='host', path='/path')
    _check_ri("user@host", SSHRI, hostname='host', username='******')
    # TODO!!!  should this be a legit URL like this?
    # _check_ri("host", SSHRI, hostname='host'))
    eq_(repr(RI("host:path")), "SSHRI(hostname='host', path='path')")

    # And now perspective 'datalad', implicit=True urls pointing to the
    # canonical center location
    _check_ri("///", DataLadRI)
    _check_ri("///p/s1", DataLadRI, path='p/s1')
    # could be considered by someone as "URI reference" relative to scheme
    _check_ri("//a/", DataLadRI, remote='a')
    _check_ri("//a/data", DataLadRI, path='data', remote='a')

    # here we will do custom magic allowing only schemes with + in them, such
    # as dl+archive
    # or not so custom as
    _check_ri("hg+https://host/user/proj", URL, scheme="hg+https",
              hostname='host', path='/user/proj')
    # "old" style
    _check_ri("dl+archive:KEY/path/sp1#size=123", URL,
              scheme='dl+archive', path='KEY/path/sp1', fragment='size=123')
    # "new" style
    _check_ri("dl+archive:KEY#path=path/sp1&size=123", URL,
              scheme='dl+archive', path='KEY',
              fragment='path=path/sp1&size=123')
    # actually above one is probably wrong since we need to encode the path
    _check_ri("dl+archive:KEY#path=path%2Fbsp1&size=123", URL,
              scheme='dl+archive', path='KEY',
              fragment='path=path%2Fbsp1&size=123')

    # https://en.wikipedia.org/wiki/File_URI_scheme
    _check_ri("file://host", URL, scheme='file', hostname='host')
    _check_ri("file://host/path/sp1", URL, scheme='file', hostname='host',
              path='/path/sp1')
    # stock libraries of Python aren't quite ready for ipv6
    ipv6address = '2001:db8:85a3::8a2e:370:7334'
    _check_ri("file://%s/path/sp1" % ipv6address, URL,
              scheme='file', hostname=ipv6address, path='/path/sp1')
    for lh in ('localhost', '::1', '', '127.3.4.155'):
        _check_ri("file://%s/path/sp1" % lh, URL, localpath='/path/sp1',
                  scheme='file', hostname=lh, path='/path/sp1')
    _check_ri('http://[1fff:0:a88:85a3::ac1f]:8001/index.html', URL,
              scheme='http', hostname='1fff:0:a88:85a3::ac1f', port=8001,
              path='/index.html')
    _check_ri("file:///path/sp1", URL, localpath='/path/sp1',
              scheme='file', path='/path/sp1')
    # we don't do any magical comprehension for home paths/drives for windows
    # of file:// urls, thus leaving /~ and /c: for now:
    _check_ri("file:///~/path/sp1", URL, localpath='/~/path/sp1',
              scheme='file', path='/~/path/sp1')
    _check_ri("file:///%7E/path/sp1", URL, localpath='/~/path/sp1',
              scheme='file', path='/~/path/sp1', exact_str=False)
    # not sure but let's check
    _check_ri("file:///c:/path/sp1", URL, localpath='/c:/path/sp1',
              scheme='file', path='/c:/path/sp1', exact_str=False)

    # and now implicit paths or actually they are also "URI references"
    _check_ri("f", PathRI, localpath='f', path='f')
    _check_ri("f/s1", PathRI, localpath='f/s1', path='f/s1')
    _check_ri(PurePosixPath("f"), PathRI, localpath='f', path='f')
    _check_ri(PurePosixPath("f/s1"), PathRI, localpath='f/s1', path='f/s1')
    # colons are problematic and might cause confusion into SSHRI
    _check_ri("f/s:1", PathRI, localpath='f/s:1', path='f/s:1')
    _check_ri("f/s:", PathRI, localpath='f/s:', path='f/s:')
    _check_ri("/f", PathRI, localpath='/f', path='/f')
    _check_ri("/f/s1", PathRI, localpath='/f/s1', path='/f/s1')

    # some github ones, just to make sure
    _check_ri("git://host/user/proj", URL, scheme="git", hostname="host",
              path="/user/proj")
    _check_ri("git@host:user/proj", SSHRI, hostname="host", path="user/proj",
              username='******')

    _check_ri('weired:/', SSHRI, hostname='weired', path='/')
    # since the scheme does not allow some symbols, we need an additional check
    _check_ri('weired_url:/', SSHRI, hostname='weired_url', path='/')
    _check_ri('example.com:/', SSHRI, hostname='example.com', path='/')
    _check_ri('example.com:path/sp1', SSHRI, hostname='example.com',
              path='path/sp1')
    _check_ri('example.com/path/sp1\:fname', PathRI,
              localpath='example.com/path/sp1\:fname',
              path='example.com/path/sp1\:fname')
    # ssh is as stupid as us, so we will stay "Consistently" dumb
    """
    $> ssh example.com/path/sp1:fname
    ssh: Could not resolve hostname example.com/path/sp1:fname: Name or service not known

    edit 20190516 yoh: but this looks like a perfectly valid path.
    SSH knows that it is not a path but its SSHRI so it can stay dumb.
    We are trying to be smart and choose between RIs (even when we know that
    it is e.g. a file).
    """
    _check_ri('e.com/p/sp:f', PathRI, localpath='e.com/p/sp:f',
              path='e.com/p/sp:f')
    _check_ri('[email protected]/mydir', PathRI, localpath='[email protected]/mydir',
              path='[email protected]/mydir')

    # SSHRIs have .port, but it is empty
    eq_(SSHRI(hostname='example.com').port, '')

    # check that we are getting a warning logged when url can't be
    # reconstructed precisely
    # actually failed to come up with one -- becomes late here
    #_check_ri("http://host///..//p", scheme='http', path='/..//p')

    # actually this one is good enough to trigger a warning and I still don't
    # know what it should exactly be!?
    with swallow_logs(new_level=logging.DEBUG) as cml:
        weired_str = 'weired://'
        weired_url = RI(weired_str)
        repr(weired_url)
        cml.assert_logged('Parsed version of SSHRI .weired:/. '
                          'differs from original .weired://.')
        # but we store original str
        eq_(str(weired_url), weired_str)
        neq_(weired_url.as_str(), weired_str)

    raise SkipTest(
        "TODO: file://::1/some does complain about parsed version dropping ::1"
    )
def __call__(urls, *, dataset=None, path=None, overwrite=False,
             archive=False, save=True, message=None):
    from ..downloaders.http import HTTPDownloader
    from ..downloaders.providers import Providers

    ds = None
    if save or dataset:
        try:
            ds = require_dataset(
                dataset, check_installed=True,
                purpose='download urls')
        except NoDatasetFound:
            pass

    common_report = {"action": "download_url",
                     "ds": ds}

    got_ds_instance = isinstance(dataset, Dataset)
    dir_is_target = not path or str(path).endswith(op.sep)
    path = str(resolve_path(path or op.curdir, ds=dataset))
    if dir_is_target:
        # resolve_path() doesn't preserve trailing separators. Add one for
        # the download() call.
        path = path + op.sep
    urls = ensure_list_from_str(urls)

    if not dir_is_target:
        if len(urls) > 1:
            yield get_status_dict(
                status="error",
                message=(
                    "When specifying multiple urls, --path should point to "
                    "a directory target (with a trailing separator). Got %r",
                    path),
                type="file",
                path=path,
                **common_report)
            return
        if archive:
            # make sure the file suffix indicated by a URL is preserved
            # so that any further archive processing doesn't have to
            # employ mime type inspection in order to determine the archive
            # type
            from datalad.support.network import URL
            suffixes = PurePosixPath(URL(urls[0]).path).suffixes
            if not Path(path).suffixes == suffixes:
                path += ''.join(suffixes)
        # we know that we have a single URL
        # download() would be fine getting an existing directory and
        # downloading the URL underneath it, but let's enforce a trailing
        # slash here for consistency.
        if op.isdir(path):
            yield get_status_dict(
                status="error",
                message=(
                    "Non-directory path given (no trailing separator) "
                    "but a directory with that name (after adding archive "
                    "suffix) exists"),
                type="file",
                path=path,
                **common_report)
            return

    # TODO setup fancy ui.progressbars doing this in parallel and reporting
    # overall progress in % of urls which were already downloaded
    providers = Providers.from_config_files()
    downloaded_paths = []
    path_urls = {}
    need_datalad_remote = False
    for url in urls:
        # somewhat "ugly"
        downloader = providers.get_provider(url).get_downloader(url)
        try:
            downloaded_path = downloader.download(url, path=path,
                                                  overwrite=overwrite)
        except Exception as e:
            ce = CapturedException(e)
            yield get_status_dict(status="error", message=str(ce),
                                  type="file", path=path, exception=ce,
                                  **common_report)
        else:
            if not need_datalad_remote \
                    and (downloader.authenticator or downloader.credential
                         or type(downloader) != HTTPDownloader):
                need_datalad_remote = True
            downloaded_paths.append(downloaded_path)
            path_urls[downloaded_path] = url
            yield get_status_dict(status="ok", type="file",
                                  path=downloaded_path, **common_report)

    if downloaded_paths and save and ds is not None:
        msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

        for r in Save()(
                downloaded_paths,
                message=msg,
                # ATTN: Pass the original dataset argument to
                # preserve relative path handling semantics.
                dataset=dataset,
                return_type="generator",
                result_renderer='disabled',
                result_xfm=None,
                result_filter=None,
                on_failure="ignore"):
            yield r

        ds_repo = ds.repo
        if isinstance(ds_repo, AnnexRepo):
            if need_datalad_remote:
                from datalad.customremotes.base import (
                    ensure_datalad_remote,
                )
                ensure_datalad_remote(
                    ds_repo, autoenable=True, encryption=None)

            if got_ds_instance:
                # Paths in `downloaded_paths` are already relative to the
                # dataset.
                rpaths = dict(zip(downloaded_paths, downloaded_paths))
            else:
                # Paths in `downloaded_paths` are already relative to the
                # current working directory. Take these relative to the
                # dataset for use with the AnnexRepo method calls.
                rpaths = {}
                for orig_path, resolved in zip(
                        downloaded_paths,
                        resolve_path(downloaded_paths, ds=dataset)):
                    rpath = path_under_rev_dataset(ds, resolved)
                    if rpath:
                        rpaths[str(rpath)] = orig_path
                    else:
                        lgr.warning("Path %s not under dataset %s",
                                    orig_path, ds)
            annex_paths = [
                p for p, annexed in zip(
                    rpaths,
                    ds_repo.is_under_annex(list(rpaths.keys())))
                if annexed
            ]
            if annex_paths:
                for path in annex_paths:
                    url = path_urls[rpaths[path]]
                    try:
                        # The file is already present. This is just to
                        # register the URL.
                        ds_repo.add_url_to_file(
                            path,
                            url,
                            # avoid batch mode for single files
                            # https://github.com/datalad/datalad/issues/2849
                            batch=len(annex_paths) > 1,
                            # bypass URL size check, we already have the file
                            options=['--relaxed'])
                    except CommandError as exc:
                        lgr.warning("Registering %s with %s failed: %s",
                                    path, url, CapturedException(exc))

                if archive:
                    for path in annex_paths:
                        yield from ds.add_archive_content(
                            path,
                            delete=True,
                            on_failure='ignore',
                            return_type='generator',
                            result_renderer='disabled')
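
# Usage sketch (hypothetical URL and target, not from the source): the command
# implemented above is exposed as `datalad.api.download_url`. A trailing path
# separator makes the path a directory target, matching the `dir_is_target`
# logic above.
def _demo_download_url():
    import datalad.api as dl
    return dl.download_url(
        'https://example.com/data.csv',
        path='inputs' + op.sep,
        archive=False,
        save=True,
        return_type='list',
        result_renderer='disabled')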
def _p(rpath):
    return str(Path(PurePosixPath(rpath)))
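
# Example sketch (hypothetical, not part of the source): `_p` turns a
# POSIX-style relative path into platform conventions; on POSIX the string is
# unchanged, on Windows the separators become backslashes.
def _demo_p():
    assert _p('dir/file.dat') == str(Path('dir') / 'file.dat')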
def __call__(dataset=None, path=None, sources=None, process_type=None,
             format='native'):
    ds = require_dataset(
        dataset or curdir,
        purpose="extract metadata",
        check_installed=not path)

    # check what extractors we want as sources, and whether they are
    # available
    if not sources:
        sources = ['metalad_core', 'metalad_annex'] \
            + assure_list(get_metadata_type(ds))
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {}
    for ep in iter_entry_points('datalad.metadata.extractors'):
        if ep.name not in sources:
            # not needed here
            continue
        rec = dict(entrypoint=ep)
        if ep.name in extractors:  # pragma: no cover
            # potential conflict
            if extractors[
                    ep.name]['entrypoint'].dist.project_name == 'datalad':
                # this is OK, just state it is happening
                lgr.debug('Extractor %s overrides datalad-core variant', ep)
                extractors[ep.name] = rec
            elif ep.dist.project_name == 'datalad':
                # also OK
                lgr.debug('Prefer extractor %s over datalad-core variant', ep)
            else:
                msg = ('At least two DataLad extensions provide metadata '
                       'extractor %s: %s vs. %s',
                       ep.name, ep.dist, extractors[ep.name]['entrypoint'].dist)
                if ep.name in sources:
                    # this extractor is required -> blow hard
                    raise RuntimeError(msg[0] % msg[1:])
                else:
                    # still moan
                    lgr.warning(*msg)
                # ignore the newcomer, is listed second in sys.path
        else:
            # this is fresh and unique
            extractors[ep.name] = rec
    for msrc in sources:
        if msrc not in extractors:
            # we said that we want to fail, rather than just moan about
            # less metadata
            raise ValueError(
                "Enabled metadata extractor '{}' not available".format(msrc),
            )
        # load extractor implementation
        rec = extractors[msrc]
        rec['process_type'] = process_type \
            if process_type and not process_type == 'extractors' \
            else ds.config.obtain(
                'datalad.metadata.extract-from-{}'.format(
                    msrc.replace('_', '-')),
                default='all')
        # load the extractor class, no instantiation yet
        try:
            rec['class'] = rec['entrypoint'].load()
        except Exception as e:  # pragma: no cover
            msg = ('Failed %s metadata extraction from %s: %s',
                   msrc, ds, exc_str(e))
            log_progress(lgr.error, 'metadataextractors', *msg)
            raise ValueError(msg[0] % msg[1:])

    res_props = dict(
        action='meta_extract',
        logger=lgr,
    )

    # build report on extractors and their state info
    if process_type == 'extractors':
        for ename, eprops in iteritems(extractors):
            state = {}
            # do not trip over old extractors
            if hasattr(eprops['class'], 'get_state'):
                state.update(eprops['class']().get_state(ds))
            yield dict(
                action='meta_extract',
                path=ds.path,
                status='ok',
                logger=lgr,
                extractor=ename,
                state=dict(
                    state,
                    process_type=eprops['process_type'],
                ),
            )
        return

    # build a representation of the dataset's content (incl subds
    # records)
    # go through a high-level command (not just the repo methods) to
    # get all the checks and sanitization of input arguments
    # this call is relatively expensive, but already anticipates
    # demand for information by our core extractors that always run
    # unconditionally, hence no real slowdown here
    # TODO this could be a dict, but MIH cannot think of an access
    # pattern that does not involve iteration over all items
    status = []
    exclude_paths = [
        ds.pathobj / PurePosixPath(e)
        for e in (
            list(exclude_from_metadata) + assure_list(
                ds.config.get('datalad.metadata.exclude-path', []))
        )
    ]
    if ds.is_installed():
        # we can make use of status
        res_props.update(refds=ds.path)
        for r in ds.status(
                # let status sort out all path arg handling
                # but this will likely make it impossible to use this
                # command to just process an individual file independent
                # of a dataset
                path=path,
                # it is safe to ask for annex info even when a dataset is
                # plain Git
                # NOTE changing to 'annex=availability' has substantial
                # performance costs, as it involves resolving each annex
                # symlink on the file-system, which can be really slow
                # depending on the FS and the number of annexed files
                annex='basic',
                # TODO we never want to aggregate metadata from untracked
                # content, but we might just want to see what we can get
                # from a file
                untracked='no',
                # this command cannot and will not work recursively
                recursive=False,
                result_renderer='disabled'):
            # path reports are always absolute and anchored on the dataset
            # (no repo) path
            p = Path(r['path'])
            if p in exclude_paths or \
                    any(e in p.parents for e in exclude_paths):
                # this needs to be ignored for any further processing
                continue
            # strip useless context information
            status.append({
                k: v for k, v in iteritems(r)
                if (k not in ('refds', 'parentds', 'action', 'status')
                    and not k.startswith('prev_'))
            })

        # determine the commit that we are describing
        refcommit = get_refcommit(ds)
        if refcommit is None or not len(status):
            # this seems extreme, but without a single commit there is
            # nothing we can have, or describe -> blow
            yield dict(
                res_props,
                status='error',
                message=
                'No metadata-relevant repository content found. '
                'Cannot determine reference commit for metadata ID',
                type='dataset',
                path=ds.path,
            )
            return
        # stamp every result
        res_props['refcommit'] = refcommit
    else:
        # no dataset at hand, take path arg at face value and hope
        # for the best
        # TODO we have to resolve the given path to make it match what
        # status is giving (abspath with ds (not repo) anchor)
        status = [dict(path=p, type='file') for p in assure_list(path)]
        # just for compatibility, mandatory argument list below
        refcommit = None

    if ds.is_installed():
        # check availability requirements and obtain data as needed
        needed_paths = set()
        for rec in extractors.values():
            if hasattr(rec['class'], 'get_required_content'):
                needed_paths.update(
                    # new extractors do not need any instantiation args
                    s['path'] for s in rec['class']().get_required_content(
                        ds, rec['process_type'], status)
                )
        if needed_paths:
            for r in ds.get(path=needed_paths,
                            return_type='generator',
                            result_renderer='disabled'):
                if success_status_map.get(
                        r['status'], False) != 'success':  # pragma: no cover
                    # only complain when something goes wrong
                    yield r

    contexts = {}
    nodes_by_context = {}
    try:
        for res in _proc(
                ds, refcommit, sources, status, extractors, process_type):
            if format == 'native':
                # that is what we pass around internally
                res.update(**res_props)
                yield res
            elif format == 'jsonld':
                collect_jsonld_metadata(
                    ds.pathobj, res, nodes_by_context, contexts)
    finally:
        # extractors can come from any source with no guarantee for
        # proper implementation. Let's make sure that we bring the
        # dataset back into a sane state (e.g. no batch processes
        # hanging around). We should do this here, as it is not
        # clear whether extraction results will be saved to the
        # dataset (which would have a similar sanitization effect)
        if ds.repo:
            ds.repo.precommit()

    if format == 'jsonld':
        yield dict(
            status='ok',
            type='dataset',
            path=ds.path,
            metadata=format_jsonld_metadata(nodes_by_context),
            **res_props)
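
# Usage sketch (hypothetical dataset path; assumes the DataLad extension
# providing the command above is installed, so it is available as
# `datalad.api.meta_extract` with the signature shown above): extract
# dataset-level metadata with the core extractor only.
def _demo_meta_extract(ds_path):
    import datalad.api as dl
    return dl.meta_extract(
        dataset=ds_path,
        sources=['metalad_core'],
        process_type='dataset',
        return_type='list',
        result_renderer='disabled')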