def add_name_fixing(path):
    """Append name-fixing procedures to the ``dicomseries:all`` spec snippet.

    Loads the spec stream at `path`, appends the two fix-up procedures to
    every ``dicomseries:all`` snippet, and writes the stream back in place.

    Parameters
    ----------
    path : str
        Path to a study specification stream file (one JSON record per line).
    """
    import datalad.support.json_py as json_py
    spec = [r for r in json_py.load_stream(path)]

    # Note: We append the procedure to dicomseries:all, since we do not
    # technically act upon a particular series. This is because the procedure
    # concerns the outcome of the conversion, not the raw data. The file
    # selection has to be done within the procedure and can't be controlled by
    # the spec or hirni-spec2bids ATM.
    for snippet in spec:
        if snippet['type'] == 'dicomseries:all':
            # BUG FIX: the original code appended a single dict literal that
            # contained 'procedure-name' TWICE; in a Python dict display the
            # later key silently overwrites the earlier one, so the
            # 'change-dwi-run-to-acq_fix_all' procedure was never recorded.
            # Append one record per procedure instead.
            snippet['procedures'].extend([
                {
                    'procedure-name': {
                        'value': 'change-dwi-run-to-acq_fix_all',
                        'approved': True
                    },
                    'on-anonymize': {
                        'value': False,
                        'approved': True
                    },
                },
                {
                    'procedure-name': {
                        'value': 'fieldmaps-to-phase-or-magnitude_fix_all',
                        'approved': True
                    },
                    'on-anonymize': {
                        'value': False,
                        'approved': True
                    },
                },
            ])

    json_py.dump2stream(spec, path)
def put(self, path=None):
    """Create or replace a file in the dataset and save the change.

    Rejects the request with 403 when the resource is read-only and with
    400 when no target path or content was supplied. Existing files are
    removed from the repo before being rewritten. Content is written as a
    JSON stream, as plain JSON, or verbatim, depending on the ``json``
    request argument, and the result is saved to the dataset.

    Parameters
    ----------
    path : str, optional
        Target path; falls back to the parsed request argument.
    """
    if self.read_only:
        abort(403)

    args = self.rp.parse_args()
    path = path or args.path
    if path is None or args.content is None:
        # BadRequest
        abort(400)
    file_abspath = self._validate_file_path(
        path, fail_nonexistent=False)
    # TODO handle failure without crashing
    if op.exists(file_abspath):
        self.ds.repo.remove(file_abspath)
        # TODO git checkout of that removed files, when
        # below fails
    # TODO support file uploads
    dirname = op.dirname(file_abspath)
    if not op.exists(dirname):
        os.makedirs(dirname)
    if args.json == 'stream':
        json_py.dump2stream(
            json_py.loads(args.content), file_abspath)
    elif args.json == 'yes':
        json_py.dump(
            json_py.loads(args.content), file_abspath)
    else:
        # FIX: use a context manager instead of a bare open().write() so the
        # file handle is deterministically closed (and content flushed to
        # disk) before the subsequent save operates on the file. The original
        # relied on CPython's refcounting to close the handle, which is an
        # implementation detail and can leak the descriptor.
        with open(file_abspath, 'w') as f:
            f.write(args.content)
    self.ds.save(
        file_abspath,
        to_git=args.togit,
        #message="",
    )
def _create_record(run_info, sidecar_flag, ds):
    """
    Returns
    -------
    str or None, str or None
      The first value is either the full run record in JSON serialized form,
      or content-based ID hash, if the record was written to a file. In that
      latter case, the second value is the path to the record sidecar file,
      or None otherwise.
    """
    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    # Decide whether to place the record in a sidecar file: an explicit flag
    # wins, otherwise consult the dataset configuration.
    if sidecar_flag is not None:
        use_sidecar = sidecar_flag
    else:
        cfg_value = ds.config.get(
            'datalad.run.record-sidecar', default=False)
        use_sidecar = anything2bool(cfg_value)

    record_id = None
    record_path = None
    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()  # nosec
        record_dir = ds.config.get(
            'datalad.run.record-directory',
            default=op.join('.datalad', 'runinfo'))
        record_path = ds.pathobj / record_dir / record_id
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much
            # difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    return record_id or record, record_path
def test_dump2stream(path):
    # Round-trip a two-record stream through dump/load, both plain and
    # xz-compressed, and verify the records come back unchanged.
    stream = [dict(a=5), dict(b=4)]
    dump2stream(stream, path)
    eq_(list(load_stream(path)), stream)
    # the same for compression
    dump2xzstream(stream, path)
    eq_(list(load_xzstream(path)), stream)
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                message=None, rerun_info=None, rerun_outputs=None,
                sidecar=None):
    """Execute `cmd` in `dataset`, yielding results, and commit the outcome.

    Generator: yields result dicts for input retrieval, output unlocking,
    and the final save. Records a JSON "run record" in the commit message
    (or a sidecar file under the record directory when sidecar mode is on).

    NOTE(review): `rerun_info`/`rerun_outputs` appear to be supplied by a
    `rerun` caller replaying a previous record — TODO confirm against caller.
    """
    # A rerun record carries the original working directory relative to the
    # dataset; recompute absolute/relative pwd pair from it.
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    # delayed imports
    from datalad.cmd import Runner

    lgr.debug('tracking command output underneath %s', ds)
    # Refuse to run on a dirty dataset, otherwise command-caused changes
    # cannot be told apart from pre-existing modifications.
    if not rerun_info and ds.repo.dirty:  # Rerun already takes care of this.
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('unsaved modifications present, '
                     'cannot detect changes by command'))
        return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    if inputs:
        # make sure inputs are locally available before running the command
        for res in ds.get(inputs.expand(full=True), on_failure="ignore"):
            yield res

    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"],
                           warn=not rerun_info)
    if outputs:
        for res in _unlock_or_remove(ds, outputs.expand(full=True)):
            yield res

    if rerun_outputs is not None:
        # These are files we need to unlock/remove for a rerun that aren't
        # included in the explicit outputs. Unlike inputs/outputs, these are
        # full paths, so we can pass them directly to unlock.
        for res in _unlock_or_remove(ds, rerun_outputs):
            yield res

    # Substitute {pwd}/{dspath}/{inputs}/{outputs} placeholders in the
    # command string.
    sfmt = SequenceFormatter()
    cmd_expanded = sfmt.format(cmd, pwd=pwd, dspath=ds.path,
                               inputs=inputs.expand(dot=False),
                               outputs=outputs.expand(dot=False))

    # we have a clean dataset, let's run things
    exc = None
    cmd_exitcode = None
    runner = Runner(cwd=pwd)
    try:
        lgr.info("== Command start (output follows) =====")
        runner.run(
            cmd_expanded,
            # immediate output
            log_online=True,
            # not yet sure what we should do with the command output
            # IMHO `run` itself should be very silent and let the command talk
            log_stdout=False,
            log_stderr=False,
            expect_stderr=True,
            expect_fail=True,
            # TODO stdin
        )
    except CommandError as e:
        # strip our own info from the exception. The original command output
        # went to stdout/err -- we just pass on the exitcode in the same way
        exc = e
        cmd_exitcode = e.code
        if rerun_info and rerun_info.get("exit", 0) != cmd_exitcode:
            # we failed in a different way during a rerun. This can easily
            # happen if we try to alter a locked file
            #
            # TODO add the ability to `git reset --hard` the dataset tree on failure
            # we know that we started clean, so we could easily go back, needs gh-1424
            # to be able to do it recursively
            raise exc

    lgr.info("== Command exit (modification check follows) =====")

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode if cmd_exitcode is not None else 0,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    # NOTE(review): config value is used truthily here without conversion —
    # a string like "false" from git config would count as True; later
    # revisions route this through anything2bool.
    use_sidecar = sidecar or (
        sidecar is None and
        ds.config.get('datalad.run.record-sidecar', default=False))

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get(
            'datalad.run.record-directory',
            default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd),
        '"{}"'.format(record_id) if use_sidecar else record)
    msg = assure_bytes(msg)

    if not rerun_info and cmd_exitcode:
        # Command failed on a fresh run: stash the prepared commit message
        # in git's COMMIT_EDITMSG so the user can save manually, then
        # re-raise the command failure.
        msg_path = opj(relpath(ds.repo.repo.git_dir), "COMMIT_EDITMSG")
        with open(msg_path, "wb") as ofh:
            ofh.write(msg)
        lgr.info("The command had a non-zero exit code. "
                 "If this is expected, you can save the changes with "
                 "'datalad save -r -F%s .'", msg_path)
        raise exc
    else:
        for r in ds.add('.', recursive=True, message=msg):
            yield r
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try
        to use fairly specific key names and are encouraged to nest fields
        under a top-level "namespace" key (e.g., the project or extension
        name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    # A rerun record carries the original working directory relative to the
    # dataset; recompute absolute/relative pwd pair from it.
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(dataset, check_installed=True,
                         purpose='tracking outcomes of a command')
    ds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        # MIH: is_dirty() is gone, but status() can do all of the above!
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(
        extra_inputs,
        pwd=pwd,
        # Follow same expansion rules as `inputs`.
        expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    # ATTN: For correct path handling, all dataset commands call should be
    # unbound. They should (1) receive a string dataset argument, (2) receive
    # relative paths, and (3) happen within a chpwd(pwd) context.
    if not inject:
        with chpwd(pwd):
            for res in prepare_inputs(ds_path, inputs, extra_inputs):
                yield res
            if outputs:
                for res in _install_and_reglob(ds_path, outputs):
                    yield res
                for res in _unlock_or_remove(ds_path, outputs.expand()):
                    yield res
            if rerun_outputs is not None:
                for res in _unlock_or_remove(ds_path, rerun_outputs):
                    yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(
            ds,
            cmd,
            pwd=pwd,
            dspath=ds_path,
            # Check if the command contains "{tmpdir}" to avoid creating an
            # unnecessary temporary directory in most but not all cases.
            tmpdir=mkdtemp(prefix="datalad-run-") if "{tmpdir}" in cmd else "",
            inputs=inputs,
            outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar',
                                    default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get(
            'datalad.run.record-directory',
            default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds_path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    # With explicit=True only the declared outputs are saved; do_save is
    # False exactly when the explicit output list expanded to nothing.
    outputs_to_save = outputs.expand() if explicit else None
    do_save = outputs_to_save is None or outputs_to_save
    if not rerun_info and cmd_exitcode:
        if do_save:
            # Command failed on a fresh run: stash the prepared commit
            # message in git's COMMIT_EDITMSG for a manual save, then
            # re-raise the command failure.
            repo = ds.repo
            msg_path = relpath(opj(str(repo.dot_git), "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info(
                "The command had a non-zero exit code. "
                "If this is expected, you can save the changes with "
                "'datalad save -d . -r -F %s'",
                msg_path)
        raise exc
    elif do_save:
        with chpwd(pwd):
            for r in Save.__call__(dataset=ds_path,
                                   path=outputs_to_save,
                                   recursive=True,
                                   message=msg,
                                   return_type='generator'):
                yield r
def run_command(cmd, dataset=None, inputs=None, outputs=None, expand=None,
                explicit=False, message=None, sidecar=None,
                extra_info=None,
                rerun_info=None,
                extra_inputs=None,
                rerun_outputs=None,
                inject=False,
                saver=_save_outputs):
    """Run `cmd` in `dataset` and record the results.

    `Run.__call__` is a simple wrapper over this function. Aside from backward
    compatibility kludges, the only difference is that `Run.__call__` doesn't
    expose all the parameters of this function. The unexposed parameters are
    listed below.

    Parameters
    ----------
    extra_info : dict, optional
        Additional information to dump with the json run record. Any value
        given here will take precedence over the standard run key. Warning: To
        avoid collisions with future keys added by `run`, callers should try
        to use fairly specific key names and are encouraged to nest fields
        under a top-level "namespace" key (e.g., the project or extension
        name).
    rerun_info : dict, optional
        Record from a previous run. This is used internally by `rerun`.
    extra_inputs : list, optional
        Inputs to use in addition to those specified by `inputs`. Unlike
        `inputs`, these will not be injected into the {inputs} format field.
    rerun_outputs : list, optional
        Outputs, in addition to those in `outputs`, determined automatically
        from a previous run. This is used internally by `rerun`.
    inject : bool, optional
        Record results as if a command was run, skipping input and output
        preparation and command execution. In this mode, the caller is
        responsible for ensuring that the state of the working tree is
        appropriate for recording the command's results.
    saver : callable, optional
        Must take a dataset instance, a list of paths to save, and a
        message string as arguments and must record any changes done to any
        content matching an entry in the path list. Must yield result
        dictionaries as a generator.

    Yields
    ------
    Result records for the run.
    """
    if not cmd:
        lgr.warning("No command given")
        return

    # A rerun record carries the original working directory relative to the
    # dataset; recompute absolute/relative pwd pair from it.
    rel_pwd = rerun_info.get('pwd') if rerun_info else None
    if rel_pwd and dataset:
        # recording is relative to the dataset
        pwd = normpath(opj(dataset.path, rel_pwd))
        rel_pwd = relpath(pwd, dataset.path)
    else:
        pwd, rel_pwd = get_command_pwds(dataset)

    ds = require_dataset(
        dataset, check_installed=True,
        purpose='tracking outcomes of a command')

    # not needed ATM
    #refds_path = ds.path

    lgr.debug('tracking command output underneath %s', ds)

    if not (rerun_info or inject):  # Rerun already takes care of this.
        # For explicit=True, we probably want to check whether any inputs have
        # modifications. However, we can't just do is_dirty(..., path=inputs)
        # because we need to consider subdatasets and untracked files.
        if not explicit and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=('unsaved modifications present, '
                         'cannot detect changes by command'))
            return

    cmd = normalize_command(cmd)

    inputs = GlobbedPaths(inputs, pwd=pwd,
                          expand=expand in ["inputs", "both"])
    extra_inputs = GlobbedPaths(extra_inputs, pwd=pwd,
                                # Follow same expansion rules as `inputs`.
                                expand=expand in ["inputs", "both"])
    outputs = GlobbedPaths(outputs, pwd=pwd,
                           expand=expand in ["outputs", "both"])

    if not inject:
        for res in prepare_inputs(ds, inputs, extra_inputs):
            yield res

        if outputs:
            for res in _install_and_reglob(ds, outputs):
                yield res
            for res in _unlock_or_remove(ds, outputs.expand(full=True)):
                yield res

        if rerun_outputs is not None:
            # These are files we need to unlock/remove for a rerun that aren't
            # included in the explicit outputs. Unlike inputs/outputs, these are
            # full paths, so we can pass them directly to unlock.
            for res in _unlock_or_remove(ds, rerun_outputs):
                yield res
    else:
        # If an inject=True caller wants to override the exit code, they can do
        # so in extra_info.
        cmd_exitcode = 0
        exc = None

    try:
        cmd_expanded = format_command(ds, cmd,
                                      pwd=pwd,
                                      dspath=ds.path,
                                      inputs=inputs,
                                      outputs=outputs)
    except KeyError as exc:
        yield get_status_dict(
            'run',
            ds=ds,
            status='impossible',
            message=('command has an unrecognized placeholder: %s',
                     exc))
        return

    if not inject:
        cmd_exitcode, exc = _execute_command(
            cmd_expanded, pwd,
            expected_exit=rerun_info.get("exit", 0) if rerun_info else None)

    # amend commit message with `run` info:
    # - pwd if inside the dataset
    # - the command itself
    # - exit code of the command
    run_info = {
        'cmd': cmd,
        'exit': cmd_exitcode,
        'chain': rerun_info["chain"] if rerun_info else [],
        'inputs': inputs.paths,
        'extra_inputs': extra_inputs.paths,
        'outputs': outputs.paths,
    }
    if rel_pwd is not None:
        # only when inside the dataset to not leak information
        run_info['pwd'] = rel_pwd
    if ds.id:
        run_info["dsid"] = ds.id
    if extra_info:
        run_info.update(extra_info)

    record = json.dumps(run_info, indent=1, sort_keys=True, ensure_ascii=False)

    if sidecar is None:
        use_sidecar = ds.config.get('datalad.run.record-sidecar',
                                    default=False)
        # If ConfigManager gets the ability to say "return single value",
        # update this code to use that.
        if isinstance(use_sidecar, tuple):
            # Use same precedence as 'git config'.
            use_sidecar = use_sidecar[-1]
        use_sidecar = anything2bool(use_sidecar)
    else:
        use_sidecar = sidecar

    if use_sidecar:
        # record ID is hash of record itself
        from hashlib import md5
        record_id = md5(record.encode('utf-8')).hexdigest()
        record_dir = ds.config.get(
            'datalad.run.record-directory',
            default=op.join('.datalad', 'runinfo'))
        record_path = op.join(ds.path, record_dir, record_id)
        if not op.lexists(record_path):
            # go for compression, even for minimal records not much difference, despite offset cost
            # wrap in list -- there is just one record
            dump2stream([run_info], record_path, compressed=True)

    # compose commit message
    msg = u"""\
[DATALAD RUNCMD] {}

=== Do not change lines below ===
{}
^^^ Do not change lines above ^^^
"""
    msg = msg.format(
        message if message is not None else _format_cmd_shorty(cmd_expanded),
        '"{}"'.format(record_id) if use_sidecar else record)

    # With explicit=True only the declared outputs are saved; otherwise the
    # whole tree ('.') is handed to the saver.
    outputs_to_save = outputs.expand(full=True) if explicit else '.'
    if not rerun_info and cmd_exitcode:
        if outputs_to_save:
            # Command failed on a fresh run: stash the prepared commit
            # message in git's COMMIT_EDITMSG for a manual save, then
            # re-raise the command failure.
            msg_path = relpath(opj(ds.repo.path,
                                   ds.repo.get_git_dir(ds.repo),
                                   "COMMIT_EDITMSG"))
            with open(msg_path, "wb") as ofh:
                ofh.write(assure_bytes(msg))
            lgr.info("The command had a non-zero exit code. "
                     "If this is expected, you can save the changes with "
                     "'datalad add -d . -r -F %s .'",
                     msg_path)
        raise exc
    elif outputs_to_save:
        for r in saver(ds, outputs_to_save, msg):
            yield r
def __call__(path=None, spec=None, dataset=None, subject=None,
             anon_subject=None, acquisition=None, properties=None):
    """Derive a study specification from DICOM metadata of `path`.

    Generator: yields result dicts. Reads dataset-level DICOM metadata,
    merges it into the (possibly pre-existing) spec stream at `spec`,
    de-duplicates repeated series, writes the spec back, and saves it to git.
    """
    # TODO: acquisition can probably be removed (or made an alternative to
    # derive spec and/or dicom location from)
    # Change, so path needs to point directly to dicom ds?
    # Or just use acq and remove path?
    dataset = require_dataset(dataset, check_installed=True,
                              purpose="spec from dicoms")

    from datalad.utils import assure_list

    if path is not None:
        path = assure_list(path)
        path = [resolve_path(p, dataset) for p in path]
    else:
        raise InsufficientArgumentsError(
            "insufficient arguments for dicom2spec: a path is required")

    # TODO: We should be able to deal with several paths at once
    #       ATM we aren't (see also commit + message of actual spec)
    assert len(path) == 1

    if not spec:
        raise InsufficientArgumentsError(
            "insufficient arguments for dicom2spec: a spec file is required")
        # TODO: That's prob. wrong. We can derive default spec from acquisition
    else:
        spec = resolve_path(spec, dataset)

    # Start from existing spec records if the file is already there.
    spec_series_list = \
        [r for r in json_py.load_stream(spec)] if op.exists(spec) else list()

    # get dataset level metadata:
    found_some = False
    for meta in dataset.meta_dump(
            path,
            recursive=False,  # always False?
            reporton='datasets',
            return_type='generator',
            result_renderer='disabled'):
        if meta.get('status', None) not in ['ok', 'notneeded']:
            yield meta
            continue
        if 'dicom' not in meta['metadata']:
            # TODO: Really "notneeded" or simply not a result at all?
            yield dict(status='notneeded',
                       message=("found no DICOM metadata for %s",
                                meta['path']),
                       path=meta['path'],
                       type='dataset',
                       action='dicom2spec',
                       logger=lgr)
            continue
        if 'Series' not in meta['metadata']['dicom'] or \
                not meta['metadata']['dicom']['Series']:
            yield dict(
                status='impossible',
                message=("no image series detected in DICOM metadata of"
                         " %s", meta['path']),
                path=meta['path'],
                type='dataset',
                action='dicom2spec',
                logger=lgr)
            continue
        found_some = True

        overrides = dict()
        if properties:
            # load from file or json string
            props = json_py.load(properties) \
                if op.exists(properties) else json_py.loads(properties)
            # turn into editable, pre-approved records
            props = {k: dict(value=v, approved=True)
                     for k, v in props.items()}
            overrides.update(props)

        spec_series_list = add_to_spec(
            meta,
            spec_series_list,
            op.dirname(spec),
            subject=subject,
            anon_subject=anon_subject,
            # session=session,
            # TODO: parameter "session" was what
            # we now call acquisition. This is
            # NOT a good default for bids_session!
            # Particularly wrt to anonymization
            overrides=overrides,
            dataset=dataset)

    if not found_some:
        yield dict(
            status='impossible',
            message="found no DICOM metadata",
            path=path,
            # TODO: arguable should be 'file' or 'dataset', depending on path
            type='file',
            action='dicom2spec',
            logger=lgr)
        return

    # TODO: RF needed. This rule should go elsewhere:
    # ignore duplicates (prob. reruns of aborted runs)
    # -> convert highest id only
    # Note: This sorting is a q&d hack!
    # TODO: Sorting needs to become more sophisticated + include notion of :all
    spec_series_list = sorted(spec_series_list,
                              key=lambda x: get_specval(x, 'id')
                              if 'id' in x.keys() else 0)
    for i in range(len(spec_series_list)):
        # Note: Removed the following line from condition below,
        # since it appears to be pointless. Value for 'converter'
        # used to be 'heudiconv' or 'ignore' for a 'dicomseries', so
        # it's not clear ATM what case this could possibly have catched:
        # heuristic.has_specval(spec_series_list[i], "converter") and \
        #
        # Tag a series as "ignore" when a later record exists with the same
        # description/bids-run but a higher id (i.e. a rerun superseded it).
        if spec_series_list[i]["type"] == "dicomseries" and \
                has_specval(spec_series_list[i], "bids-run") and \
                get_specval(spec_series_list[i], "bids-run") in \
                [get_specval(s, "bids-run")
                 for s in spec_series_list[i + 1:]
                 if get_specval(
                     s, "description") == get_specval(
                         spec_series_list[i], "description") and \
                 get_specval(s, "id") > get_specval(
                     spec_series_list[i], "id")]:
            lgr.debug("Ignore SeriesNumber %s for conversion" % i)
            spec_series_list[i]["tags"].append(
                'hirni-dicom-converter-ignore')

    lgr.debug("Storing specification (%s)", spec)
    # store as a stream (one record per file) to be able to
    # easily concat files without having to parse them, or
    # process them line by line without having to fully parse them
    from datalad_hirni.support.spec_helpers import sort_spec
    # Note: Sorting paradigm needs to change. See above.
    # spec_series_list = sorted(spec_series_list, key=lambda x: sort_spec(x))
    json_py.dump2stream(spec_series_list, spec)

    # make sure spec is in git:
    dataset.repo.set_gitattributes([(spec, {
        'annex.largefiles': 'nothing'
    })], '.gitattributes')

    for r in Save.__call__(dataset=dataset,
                           path=[spec, '.gitattributes'],
                           to_git=True,
                           message="[HIRNI] Added study specification "
                                   "snippet for %s" %
                                   op.relpath(path[0], dataset.path),
                           return_type='generator',
                           result_renderer='disabled'):
        if r.get('status', None) not in ['ok', 'notneeded']:
            yield r
        elif r['path'] in [spec, op.join(dataset.path, '.gitattributes')] \
                and r['type'] == 'file':
            # Re-brand the save result as a dicom2spec result.
            r['action'] = 'dicom2spec'
            r['logger'] = lgr
            yield r
        elif r['type'] == 'dataset':
            # 'ok' or 'notneeded' for a dataset is okay, since we commit
            # the spec. But it's not a result to yield
            continue
        else:
            # anything else shouldn't happen
            yield dict(status='error',
                       message=("unexpected result from save: %s", r),
                       # TODO: This actually isn't clear - get it from `r`
                       path=spec,
                       type='file',
                       action='dicom2spec',
                       logger=lgr)
def __call__(path, dataset=None, spec_file=None, properties=None,
             replace=False):
    """Add a specification snippet for arbitrary paths to the study spec.

    Generator: yields result dicts. For each annotated input path, locates
    the acquisition's spec file, merges a new snippet (seeded with values
    shared by all existing snippets plus explicit `properties`), writes the
    spec back, and finally saves all updated spec files in one commit.
    """
    # TODO: message
    dataset = require_dataset(dataset, check_installed=True,
                              purpose="hirni spec4anything")
    path = assure_list(path)
    path = [resolve_path(p, dataset) for p in path]

    res_kwargs = dict(action='hirni spec4anything', logger=lgr)
    res_kwargs['refds'] = Interface.get_refds_path(dataset)

    # ### This might become superfluous. See datalad-gh-2653
    ds_path = PathRI(dataset.path)
    # ###

    updated_files = []
    paths = []
    for ap in AnnotatePaths.__call__(
            dataset=dataset,
            path=path,
            action='hirni spec4anything',
            unavailable_path_status='impossible',
            nondataset_path_status='error',
            return_type='generator',
            # TODO: Check this one out:
            on_failure='ignore',
            # Note/TODO: Not sure yet whether and when we need those.
            # Generally we want to be able to create a spec for subdatasets,
            # too:
            # recursive=recursive,
            # recursion_limit=recursion_limit,
            # force_subds_discovery=True,
            # force_parentds_discovery=True,
    ):
        if ap.get('status', None) in ['error', 'impossible']:
            yield ap
            continue

        # ### This might become superfluous. See datalad-gh-2653
        ap_path = PathRI(ap['path'])
        # ###

        # find acquisition and respective specification file:
        rel_path = posixpath.relpath(ap_path.posixpath, ds_path.posixpath)
        path_parts = rel_path.split('/')

        # TODO: Note: Outcommented this warning for now. We used to not have
        # a spec file at the toplevel of the study dataset, but now we do.
        # The logic afterwards works, but should be revisited. At least,
        # `acq` should be called differently now.
        # if len(path_parts) < 2:
        #     lgr.warning("Not within an acquisition")
        acq = path_parts[0]

        # TODO: spec file specifiable or fixed path?
        #       if we want the former, what we actually need is an
        #       association of acquisition and its spec path
        #       => prob. not an option but a config
        spec_path = spec_file if spec_file \
            else posixpath.join(ds_path.posixpath, acq,
                                dataset.config.get(
                                    "datalad.hirni.studyspec.filename",
                                    "studyspec.json"))

        spec = [r for r in json_py.load_stream(spec_path)] \
            if posixpath.exists(spec_path) else list()

        lgr.debug("Add specification snippet for %s", ap['path'])
        # XXX 'add' does not seem to be the thing we want to do
        # rather 'set', so we have to check whether a spec for a location
        # is already known and fail or replace it (maybe with --force)

        # go through all existing specs and extract unique value
        # and also assign them to the new record (subjects, ...), but only
        # editable fields!!
        uniques = dict()
        for s in spec:
            for k in s:
                if isinstance(s[k], dict) and 'value' in s[k]:
                    if k not in uniques:
                        uniques[k] = set()
                    uniques[k].add(s[k]['value'])
        overrides = dict()
        for k in uniques:
            if len(uniques[k]) == 1:
                # a field with a single shared value across snippets is
                # pre-filled, but NOT pre-approved
                overrides[k] = _get_edit_dict(value=uniques[k].pop(),
                                              approved=False)

        if properties:
            # TODO: This entire reading of properties needs to be RF'd
            # into proper generalized functions.
            # spec got more complex. update() prob. can't simply override
            # (think: 'procedures' and 'tags' prob. need to be appended
            # instead)

            # load from file or json string
            if isinstance(properties, dict):
                props = properties
            elif op.exists(properties):
                props = json_py.load(properties)
            else:
                props = json_py.loads(properties)
            # turn into editable, pre-approved records
            spec_props = {
                k: dict(value=v, approved=True)
                for k, v in props.items()
                if k not in non_editables + ['tags', 'procedures']}
            spec_props.update({
                k: v
                for k, v in props.items()
                if k in non_editables + ['tags']})
            # TODO: still wrong. It's a list. Append or override? How to decide?
            spec_props.update({
                o_k: [{
                    i_k: dict(value=i_v, approved=True)
                    for i_k, i_v in o_v.items()}]
                for o_k, o_v in props.items()
                if o_k in ['procedures']})

            overrides.update(spec_props)

        # TODO: It's probably wrong to use uniques for overwriting! At least
        # they cannot be used to overwrite values explicitly set in
        # _add_to_spec like "location", "type", etc.
        #
        # But then: This should concern non-editable fields only, right?
        spec = _add_to_spec(spec,
                            posixpath.split(spec_path)[0],
                            ap,
                            dataset,
                            overrides=overrides,
                            replace=replace)

        # Note: Not sure whether we really want one commit per snippet.
        #       If not - consider:
        #       - What if we fail amidst? => Don't write to file yet.
        #       - What about input paths from different acquisitions?
        #         => store specs per acquisition in memory
        # MIH: One commit per line seems silly. why not update all files
        # collect paths of updated files, and give them to a single `add`
        # at the very end?
        # MIH: if we fail, we fail and nothing is committed
        from datalad_hirni.support.spec_helpers import sort_spec
        json_py.dump2stream(sorted(spec, key=lambda x: sort_spec(x)),
                            spec_path)
        updated_files.append(spec_path)

        yield get_status_dict(status='ok',
                              type=ap['type'],
                              path=ap['path'],
                              **res_kwargs)
        paths.append(ap)

    from datalad.dochelpers import single_or_plural
    from os import linesep
    message = "[HIRNI] Add specification {n_snippets} for: {paths}".format(
        n_snippets=single_or_plural("snippet", "snippets", len(paths)),
        paths=linesep.join(" - " + op.relpath(p['path'], dataset.path)
                           for p in paths)
        if len(paths) > 1
        else op.relpath(paths[0]['path'], dataset.path))
    # single commit covering all spec files touched above
    for r in dataset.save(updated_files,
                          to_git=True,
                          message=message,
                          return_type='generator',
                          result_renderer='disabled'):
        yield r