def wrapped_main(*args, **kwargs):
    """
    Call the wrapped ``main`` and store a Sumatra record of the execution.

    The parameter set attached to the record is chosen as follows:

    1. the first positional argument, if it is a ``ParameterSet``;
    2. otherwise the ``parameters`` keyword argument, if it is a
       ``ParameterSet``;
    3. otherwise a ``SimpleParameterSet`` built from all arguments, with
       positional arguments stored under the keys ``arg0``, ``arg1``, ...

    The record's label is added to the parameter set under the key
    ``sumatra_label`` before ``main`` is called. Output written to
    stdout/stderr while ``main`` runs is captured and stored on the record.
    """
    named = kwargs.get("parameters")
    if args and isinstance(args[0], ParameterSet):
        # A ParameterSet was given as the first positional argument
        parameters = args[0]
    elif isinstance(named, ParameterSet):
        # A ParameterSet was given through the "parameters" keyword
        parameters = named
    else:
        # No ParameterSet given: bundle every argument into a new one
        bundled = {"arg%d" % pos: value for pos, value in enumerate(args)}
        bundled.update(kwargs)
        parameters = SimpleParameterSet(bundled)

    import sumatra.projects
    project = sumatra.projects.load_project()
    record = project.new_record(
        parameters=parameters,
        main_file=sys.modules['__main__'].__file__,
        executable=PythonExecutable(path=sys.executable),
    )
    record.launch_mode.working_directory = os.getcwd()
    parameters.update({"sumatra_label": record.label})

    began = time.time()
    with _grab_stdout_stderr() as captured:
        main(*args, **kwargs)
        record.stdout_stderr = captured.getvalue()
    record.duration = time.time() - began
    record.output_data = record.datastore.find_new_data(record.timestamp)

    project.add_record(record)
    project.save()
def wrapped_main(*args, **kwargs):
    """
    Call the wrapped ``main``, tracking its status in a Sumatra record.

    The record is added to the project *before* ``main`` is called, tagged
    with the "running" status. Once ``main`` returns or fails, the status tag
    becomes "finished", "killed" (``KeyboardInterrupt``) or "failed" (any
    other ``Exception``, whose ``repr`` is stored as the record outcome and
    whose traceback is printed). Neither of those exceptions is re-raised.

    The parameter set attached to the record is the first positional
    argument if it is a ``ParameterSet``, else the ``parameters`` keyword
    argument if it is a ``ParameterSet``, else a ``SimpleParameterSet``
    built from all arguments (positional ones keyed ``arg0``, ``arg1``, ...).
    """
    if args and isinstance(args[0], ParameterSet):
        # ParameterSet passed positionally
        parameters = args[0]
    elif isinstance(kwargs.get("parameters"), ParameterSet):
        # ParameterSet passed by keyword
        parameters = kwargs["parameters"]
    else:
        # Wrap all the arguments in a SimpleParameterSet
        positional = dict(("arg%d" % i, arg) for i, arg in enumerate(args))
        parameters = SimpleParameterSet({**positional, **kwargs})

    import sumatra.projects
    project = sumatra.projects.load_project()
    script = sys.modules['__main__'].__file__
    interpreter = PythonExecutable(path=sys.executable)
    record = project.new_record(parameters=parameters,
                                main_file=script,
                                executable=interpreter)
    record.launch_mode.working_directory = os.getcwd()
    parameters.update({"sumatra_label": record.label})

    # Register the record right away, so it exists while `main` is running
    record.add_tag(STATUS_FORMAT % "running")
    record.stdout_stderr = "Not yet captured."
    project.add_record(record)

    t0 = time.time()
    with _grab_stdout_stderr() as captured:
        try:
            main(*args, **kwargs)
            status = "finished"
        except KeyboardInterrupt:
            status = "killed"
        except Exception as err:
            status = "failed"
            record.outcome = repr(err)
            traceback.print_exc()
        finally:
            record.stdout_stderr = captured.getvalue()

    # Save an interim status (suffixed with "...") before collecting the
    # output data, then save again with the final status.
    record.add_tag(STATUS_FORMAT % (status + "..."))
    project.save_record(record)
    record.duration = time.time() - t0
    record.output_data = record.datastore.find_new_data(record.timestamp)
    record.add_tag(STATUS_FORMAT % status)
    project.save_record(record)
    project.save()
def wrapped_main(parameters, *args, **kwargs):
    """
    Call ``main(parameters, *args, **kwargs)`` and record the run with Sumatra.

    A new record is created from the loaded project, using `parameters` as
    its parameter set, the file of the ``__main__`` module as main file and
    the running interpreter as executable. The record label is added to
    `parameters` under the key ``sumatra_label`` before ``main`` is called.
    Captured stdout/stderr, the duration and the newly found output data are
    stored on the record, which is then added to the project.
    """
    import sumatra.projects
    project = sumatra.projects.load_project()
    record = project.new_record(
        parameters=parameters,
        main_file=sys.modules['__main__'].__file__,
        executable=PythonExecutable(path=sys.executable))
    record.launch_mode.working_directory = os.getcwd()
    parameters.update({"sumatra_label": record.label})

    launched_at = time.time()
    with _grab_stdout_stderr() as output_buffer:
        main(parameters, *args, **kwargs)
        record.stdout_stderr = output_buffer.getvalue()
    elapsed = time.time() - launched_at
    record.duration = elapsed
    record.output_data = record.datastore.find_new_data(record.timestamp)

    project.add_record(record)
    project.save()
def run(self, cache=None, recompute=False, record=None):
    """
    Return the outputs of the task, executing it only if needed.

    Unless `recompute` is True, a previously computed result is used when
    available: the in-memory result if there is one, otherwise the output
    files found under the root of the input datastore. The task is executed
    only when no previous result is found.

    To completely disable recording, use `config.disable_recording = True`.

    Parameters
    ----------
    cache: bool
        Set to True to enable in-memory caching.
        If unspecified, read from class' default, and if that is also
        not set, from `config`.
    recompute: bool
        Force task to execute, even if it is cached.
        (default: False)
    record: bool
        Set to False to disable recording to Sumatra database.
        If unspecified, read from `config` (default config: True).

    Returns
    -------
    The task outputs: the in-memory result, a tuple of the outputs loaded
    from disk, or the value returned by `self._run`.
    """
    # Dereference links: links may change, so in the db record we want to
    # save paths to actual files
    # Typically these are files in the output datastore, but we save
    # paths relative to the *input* datastore.root, because that's the root
    # we would use to reexecute the task.
    # input_files = [os.path.relpath(os.path.realpath(input),
    #                                start=config.project.input_datastore.root)
    #                for input in self.input_files]
    if cache is None:
        cache = self.cache if self.cache is not None else config.cache_runs
    if record is None:
        record = config.record
    inroot = Path(config.project.input_datastore.root)
    outputs = None

    # First try to load pre-computed result
    if self._run_result is NotComputed and not recompute:
        # First check if output has already been produced
        _outputs = deque()
        try:
            for nm, p in zip(self.outputs, self._outputpaths_gen):
                if isinstance(self.outputs, dict):
                    fmt = self.outputs[nm]
                else:
                    fmt = None
                _outputs.append(io.load(inroot/p, format=fmt))
        except FileNotFoundError:
            # At least one output file is missing: fall through and run the task
            pass
        else:
            logger.debug(
                type(self).__qualname__ + ": loading result of previous "
                "run from disk.")
            # Only assign to `outputs` once all outputs are loaded successfully
            outputs = tuple(_outputs)
    elif not recompute:
        logger.debug(
            type(self).__qualname__ + ": loading from in-memory cache")
        outputs = self._run_result

    if outputs is None:
        # We did not find a previously computed result, so run the task
        logger.debug(
            type(self).__qualname__ + ": No cached result was found; "
            "running task.")
        input_data = [input_file.generate_key()
                      for input_file in self.input_files]
        module = sys.modules[type(self).__module__]
            # Module where task is defined
        if record:
            # Append a few chars from digest so simultaneous runs don't
            # have clashing labels
            label = datetime.now().strftime(TIMESTAMP_FORMAT) + '_' + self.digest[:4]
            smtrecord = config.project.new_record(
                parameters=self.desc,
                input_data=input_data,
                script_args=type(self).__name__,
                executable=PythonExecutable(sys.executable),
                main_file=module.__file__,
                reason=self.reason,
                label=label
                )
            start_time = time.time()
        elif not config.allow_uncommitted_changes:
            # Check that changes are committed. This is normally done in new_record().
            # See sumatra/projects.py:Project.new_record
            repository = deepcopy(config.project.default_repository)
            working_copy = repository.get_working_copy()
            config.project.update_code(working_copy)
        outputs = self._run(**self.load_inputs())
        if not isinstance(outputs, Iterable):
            # The task name is interpolated here; the original message left
            # its "{}" placeholder unfilled.
            warn(f"Task {type(self).__qualname__} did not return a tuple. "
                 "This will cause problems when composing with other tasks.")
        if record:
            smtrecord.duration = time.time() - start_time
        if len(outputs) == 0:
            warn("No output was produced.")
        elif record:
            realoutputpaths = self.write(outputs)
            if len(realoutputpaths) != len(outputs):
                warn("Something went wrong when writing task outputs. "
                     f"\nNo. of outputs: {len(outputs)} "
                     f"\nNo. of output paths: {len(realoutputpaths)}")
                # `outcome` is a text attribute of the record, not a method:
                # it must be assigned, not called.
                smtrecord.outcome = ("Error while writing to disk: possibly "
                                     "missing or unrecorded data.")
            smtrecord.output_data = [
                DataFile(path, config.project.data_store).generate_key()
                for path in realoutputpaths]
        if record:
            config.project.add_record(smtrecord)

    if cache and self._run_result is NotComputed:
        self._run_result = outputs

    return outputs
def test__write_parameters__should_call_save_on_the_parameter_set(self):
    # Check that `write_parameters` leaves the mock parameter set flagged
    # as saved.
    prog = PythonExecutable(None)
    params = MockParameterSet()
    prog.write_parameters(params, "test_parameters")
    # `assertTrue` replaces the deprecated alias `assert_`, which was removed
    # from unittest.TestCase in Python 3.12.
    self.assertTrue(params.saved)
from __future__ import unicode_literals from builtins import range from sumatra.projects import Project from sumatra.records import Record from sumatra.recordstore import django_store from sumatra.programs import PythonExecutable from sumatra.launch import SerialLaunchMode from sumatra.datastore import FileSystemDataStore from sumatra.parameters import SimpleParameterSet from sumatra.versioncontrol._git import GitRepository import random serial = SerialLaunchMode() executable = PythonExecutable("/usr/bin/python", version="2.7") repos = GitRepository('.') datastore = FileSystemDataStore("/path/to/datastore") project = Project("test_project", default_executable=executable, default_repository=repos, default_launch_mode=serial, data_store=datastore, record_store=django_store.DjangoRecordStore()) parameters = SimpleParameterSet({'a': 2, 'b': 3}) for i in range(50): record = Record(executable=executable, repository=repos, main_file="main.py", version="99863a9dc5f", launch_mode=serial, datastore=datastore,
def _run_and_record(self, record: bool=None, record_store=None):
    """
    Execute the task and, if requested, record the run in the Sumatra project.

    Parameters
    ----------
    record: bool | None
        Whether to create and save a Sumatra record for this run.
        If None, the value of `config.record` is used.
    record_store: path-like | None
        Location of a non-default record store. Only used when recording:
        a `DjangoRecordStore` is opened at that location, and the project
        path is redirected to a throwaway directory which is removed before
        returning.

    Returns
    -------
    The value of `self.Outputs.parse_result` applied to the result of
    `self._run`, or an `EmptyOutput` carrying the final status if an
    exception was caught during execution.

    Raises
    ------
    KeyboardInterrupt
        If `KeyboardInterrupt` or `SystemExit` is raised during execution.
    TaskExecutionError
        If any other exception is raised during execution and
        `config.on_error` is 'raise'.
    """
    # Remark: Refer to sumatra.decorators.capture for a similar pattern
    # DEVNOTE: status values should be limited to those defined in the
    #    `style_map` variable of sumatra.web.templatetags.filters:labelize_tag
    #    Otherwise the smt web interface returns an exception
    if record is None:
        record = config.record
    input_data = [input_file.generate_key()
                  for input_file in self.input_files]
    # Module where task is defined
    # Decorators set the _module_name attribute explicitely, because with the
    # dynamically created class, the `type(self)` method gets the module wrong
    module_name = getattr(self, '_module_name', type(self).__module__)
    module = sys.modules[module_name]
    old_status = 'pre_run'
    status = 'running'
    self.logger.debug(f"Status: '{old_status}' → '{status}'.")
    if record:
        if record_store:
            # Update config to use a different record store
            import tempfile
            import shutil
            from sumatra.projects import _get_project_file
            from sumatra.recordstore import DjangoRecordStore
            # `record_store` may specify a new location – in this case,
            # ensure that parent directories exist
            self.logger.debug("Configuring task to use the non-default "
                              f"record store at location {record_store}.")
            Path(record_store).parent.mkdir(parents=True, exist_ok=True)
            config.project.record_store = DjangoRecordStore(record_store)
            # Problem: We can change the attributes of the Sumatra project
            #    project in place, but when Sumatra saves the record, it
            #    updates the .smt/project file such that the value of
            #    `record_store` becomes the default for all future Task
            #    executions.
            # Solution: Create a throwaway project directory, and also
            #    change project.path to point there. This doesn't change
            #    how Sumatra behaves (project values are already loaded
            #    into runtime memory at this point), but any attempts by
            #    Sumatra to permanently change the project configuration
            #    are redirected to this throwaway directory, and discarded
            #    when we exit this function.
            tmpproject_dir = tempfile.mkdtemp()
            tmpproject_file = _get_project_file(tmpproject_dir)
            Path(tmpproject_file).parent.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(_get_project_file(config.project.path),
                            tmpproject_file)
            config.project.path = tmpproject_dir
            self.logger.debug("Created throwaway project directory "
                              f"at location {tmpproject_dir}.")
        # Append a few chars from digest so simultaneous runs don't
        # have clashing labels
        # Probabilities of having a collision after 1000 times, if we run n tasks simultaneously each time
        # (Only tasks run simultaneously may clash, b/c of the timestamp)
        #   Digest length  |  P_coll (12 tasks)  |  (24 tasks)
        #         4        |      63.47          |    98.52%
        #         6        |       0.39%         |     1.63%
        #         8        |       0.0015%       |     0.0064%
        label = datetime.now().strftime(TIMESTAMP_FORMAT) + '_' + self.digest[:6]
        # Sumatra will still work without wrapping parameters with
        # ParameterSet, but that is the format it expects. Moreover,
        # doing this allows the smtweb interface to display parameters.
        # NOTE: Sumatra doesn't recognize the Pydantic-aware types, and
        #       serializes a ParameterSet more or less by taking its str.
        #       Thus to make the recorded parameters Sumatra-safe, we use
        #       our own JSON serializer, and parse the result back into
        #       a ParameterSet – this results in a ParameterSet containing
        #       only JSON-valid entries.
        self.logger.debug("Parsing parameters from Task description.")
        parameter_str = self.desc.json(indent=2)
        try:
            # parameters=config.ParameterSet(utils.full_param_desc(self))
            parameters = config.ParameterSet(parameter_str)
        except Exception:
            # If creation of ParameterSet fails, save parameters as-is
            self.logger.debug("Creation of a ParameterSet failed; saving as "
                              "JSON string. The smtweb will not be able to "
                              "browse/filter parameter values.")
            parameters = parameter_str
        self.logger.debug(f"Creating a new Task record with label '{label}'...")
        smtrecord = config.project.new_record(
            parameters=parameters,
            input_data=input_data,
            script_args=type(self).__name__,
            executable=PythonExecutable(sys.executable),
            main_file=module.__file__,
            reason=self.reason,
            label=label
            )
        self.logger.debug("Task record created.")
        smtrecord.add_tag(STATUS_FORMAT % status)
        self.logger.debug(f"Adding record to Sumatra project '{config.project.name}'...")
        config.project.add_record(smtrecord)
        self.logger.debug("Record added to project.")
        self.logger.debug(f"Task execution start time: {datetime.now()}")
        start_time = time.time()
    elif not config.allow_uncommitted_changes:
        # Check that changes are committed. This is normally done in new_record().
        # See sumatra/projects.py:Project.new_record
        self.logger.debug("Task will not be recorded but config states to still check for uncommitted changes.")
        repository = deepcopy(config.project.default_repository)
        working_copy = repository.get_working_copy()
        config.project.update_code(working_copy)
        self.logger.debug("No uncommited change detected.")
    outputs = EmptyOutput(status=status)
    try:
        self.logger.debug("Executing the task’s code...")
        run_result = self._run(**dict(self.load_inputs()))
            # We don't use .dict() here, because that would dictifiy all nested
            # BaseModels, which would then be immediately recreated from their dicts
        self.logger.debug("Finished executing task’s code.")
        old_status = status
        status = "finished"
        self.logger.debug(f"Status: '{old_status}' → '{status}'.")
        self.logger.debug("Parsing task results...")
        outputs = self.Outputs.parse_result(run_result, _task=self)
    except (KeyboardInterrupt, SystemExit):
        self.logger.debug("Caught KeyboardInterrupt")
        old_status = status
        status = "killed"
        self.logger.debug(f"Status: '{old_status}' → '{status}'.")
        outputs = EmptyOutput(status=status)
        # When executing with multiprocessing, the mother process also
        # receives the interrupt and kills the spawned process.
        # The only statements that *are* executed are those within exception
        # handlers. So we need to reraise, to allow the unique_process_num
        # context manager in smttask.ui._run_task to clean up
        # NOTE(review): a SystemExit is also re-raised as KeyboardInterrupt
        #   here; kept as is since callers may rely on it.
        raise KeyboardInterrupt
    except Exception as e:
        old_status = status
        status = "crashed"
        self.logger.debug(f"Status: '{old_status}' → '{status}'.")
        if record:
            if smtrecord.outcome != "":
                smtrecord.outcome += "\n" + repr(e)
            else:
                smtrecord.outcome = repr(e)
        outputs = EmptyOutput(status=status)
        if config.on_error == 'raise':
            raise TaskExecutionError(self) from e
        else:
            traceback.print_exc()
    finally:
        # We place this in a finally clause, instead of just at the end, to
        # ensure this is executed even after a SIGINT during multiprocessing.
        if record:
            # Everything touching `smtrecord` stays under `if record`,
            # because the record only exists when recording.
            smtrecord.add_tag(STATUS_FORMAT % status)
            smtrecord.duration = time.time() - start_time
            if getattr(outputs, 'outcome', ""):
                if smtrecord.outcome != "":
                    smtrecord.outcome += "\n"
                if isinstance(outputs.outcome, str):
                    smtrecord.outcome += outputs.outcome
                elif isinstance(outputs.outcome, (tuple, list)):
                    smtrecord.outcome += "\n".join(
                        (str(o) for o in outputs.outcome))
                else:
                    # `Logger.warning` replaces the deprecated alias `warn`
                    self.logger.warning(
                        "Task `outcome` should be either a string "
                        "or tuple of strings. Coercing to string.")
                    smtrecord.outcome += str(outputs.outcome)
        if len(outputs) == 0:
            self.logger.warning("No output was produced.")
        elif record and status == "finished":
            self.logger.debug("Saving output...")
            smtrecord.add_tag(STATUS_FORMAT % status)
            realoutputpaths = outputs.write()
            if len(realoutputpaths) != len(outputs):
                warn("Something went wrong when writing task outputs. "
                     f"\nNo. of outputs: {len(outputs)} "
                     f"\nNo. of output paths: {len(realoutputpaths)}")
                if smtrecord.outcome != "":
                    smtrecord.outcome += "\n"
                smtrecord.outcome += ("Error while writing to disk: possibly "
                                      "missing or unrecorded data.")
            else:
                old_status = status
                status = "finished"
                self.logger.debug(f"Status: '{old_status}' → '{status}'.")
            # NB: `path` is absolute. `path` & `data_store.root` may include a symlink, so we need to resolve them to get the right anchor for a relative path
            outroot = Path(config.project.data_store.root).resolve()
            # Convert to relative output paths in a way which ensures we don't error out just before writing if there is an error
            relativeoutputpaths = []  # Fill list one at a time, so that we use the fallback path only for those which fail the conversion to relative (in theory, should be all or none, but also in theory, there should be no errors here)
            for path in realoutputpaths:
                try:
                    relpath = Path(path).resolve().relative_to(outroot)
                except Exception:  # (Normally this should be ValueError, but since we want to make sure we write out the results, we catch any exception. The only thing we want to let pass through are interrupt signals)
                    # For some unexpected reason, computing a relative path failed. Fall back to using the path iself; it might not be fully correct, but should provide enough info to allow the user to find the file
                    relpath = path
                relativeoutputpaths.append(relpath)
            smtrecord.output_data = [
                DataFile(str(relpath), config.project.data_store).generate_key()
                for relpath in relativeoutputpaths]
            self.logger.debug(f"Task {status}")
            smtrecord.add_tag(STATUS_FORMAT % status)
        if record:
            config.project.save_record(smtrecord)
            config.project.save()
            self.logger.debug("Saved record")
            if record_store:
                # Remove the directory with throwaway project file
                # (it is only created when recording, so it is only
                # removed in that case)
                self.logger.debug("Removing throwaway project directory "
                                  f"at location '{tmpproject_dir}'.")
                shutil.rmtree(tmpproject_dir, ignore_errors=True)

    return outputs
def create_surrogates(taskdesc, keep, dry_run, verbose, quiet):
    """
    Create surrogate records for outputs without records.

    For each provided TASKDESC file, check

    1) If outputs for that task are stored on disk, indicating that it was
       already run.
    2) If there is a matching entry in the record store.

    If (1) is true but (2) is false, then a new surrogate record is created,
    to associate the task desc with the output.

    Any number of TASKDESC files may be provided, and directories will be
    recursed into.

    This allows routines which query the record store for outputs to work as
    expected, but of course statistics like run time for surrogate records
    are undefined. The "surrogate" tag is added to all surrogate records.

    Reasons for having task outputs without associate record store entries
    include executing a task without recording, merging data stores without
    merging the associated record stores, and write conflicts when multiple
    processes attempt to access the record store simultaneously.

    It may be easier to understand this function with a sample of its output;
    such an example can be found in the smttask docs at this location:
    :doc:`smttask/docs/user-api/example_output_smttask_store_create-surrogates.md </user-api/example_output_smttask_store_create-surrogates.md>`.
    """
    # Summary of the arguments, as used below:
    #   taskdesc: iterable of paths to task description files or directories
    #   keep:     if true, task files are neither listed for removal nor removed
    #   dry_run:  if true, report what would be done; create and delete nothing
    #   verbose, quiet: counts shifting the log level down / up from INFO
    taskdesc = tuple(Path(p) for p in taskdesc)
        # With v8, we could do this by passing a 'path_type' argument to click.Path

    import sys
    import shutil
    from sumatra.core import TIMESTAMP_FORMAT, STATUS_FORMAT
    from sumatra.programs import PythonExecutable
    from sumatra.datastore.filesystem import DataFile

    # Set up logging
    verbose *= 10; quiet *= 10  # Logging levels are in steps of 10
    default = logging.INFO
    loglevel = max(min(default + quiet - verbose, logging.CRITICAL),
                   logging.DEBUG)
    logging.basicConfig(level=loglevel, force=True)
        # force=True to reset the root logger in case it was already created

    rsview = RecordStoreView()
    record_outputpaths = {
        frozenset(str(Path(p).resolve()) for p in rec.outputpaths)
        for rec in tqdm(rsview, desc="Scanning record store")
    }

    # Concatenate taskdesc files, recursing into directories
    taskdesc_files = []
    for taskdesc_path in taskdesc:
        if taskdesc_path.is_dir():
            for dirpath, dirnames, filenames in os.walk(taskdesc_path):
                # At present, the cost of sorting `filenames` seems to be worth
                # it to have predictable execution and easier to read output.
                # TODO: Sorting is imperfect: 'task-9' sorts after 'task-100'
                taskdesc_files.extend(
                    sorted(Path(dirpath) / filename for filename in filenames))
        else:
            taskdesc_files.append(Path(taskdesc_path))

    if len(taskdesc_files) == 0:
        print("No task files were specified. Exiting.")
        return

    taskfiles_to_delete = []
    taskfiles_to_add_as_records = []
    taskfiles_untouched = []
    for taskpath in tqdm(taskdesc_files, desc="Iterating over task files"):
        if not taskpath.exists():
            taskfiles_to_delete.append((taskpath, "(∄ task desc)"))
            continue
        task = Task.from_desc(taskpath)
        # Set the `outroot` after loading `task`: loading the task may change the project directory
        outroot = Path(config.project.data_store.root)
        outputpaths = frozenset(
            str((outroot / p).resolve()) for p in task.outputpaths.values())
        if task.saved_to_input_datastore:
            # The task's output files have been found on disk
            if outputpaths in record_outputpaths:
                # There is at least one record pointing to these output files
                taskfiles_to_delete.append((taskpath, "(∃ output, ∃ record)"))
            else:
                # The outputs exist, but no record points to them
                # => Add a surrogate record
                taskfiles_to_add_as_records.append(taskpath)
                taskfiles_to_delete.append((taskpath, "(∃ output, ∄ record)"))
                # TODO: DRY with task_types._run_and_record()
                input_data = [input_file.generate_key()
                              for input_file in task.input_files]
                module_name = getattr(task, '_module_name',
                                      type(task).__module__)
                module = sys.modules[module_name]
                parameter_str = task.desc.json(indent=2)
                try:
                    parameters = config.ParameterSet(parameter_str)
                except Exception:
                    # If creation of ParameterSet fails, save parameters as-is
                    parameters = parameter_str
                label = datetime.now().strftime(
                    TIMESTAMP_FORMAT) + '_' + task.digest[:6]
                if not dry_run:
                    smtrecord = config.project.new_record(
                        parameters=parameters,
                        input_data=input_data,
                        script_args=type(task).__name__,
                        executable=PythonExecutable(sys.executable),
                        main_file=module.__file__,
                        reason=task.reason,
                        label=label)
                    smtrecord.version = "<unknown>"
                        # Set version after creating record, otherwise the
                        # no modifications check will fail
                        # TODO: Can we disable to modification check completely ?
                        #       It's not relevant anyway.
                    smtrecord.add_tag(STATUS_FORMAT % "finished")
                        # NB: status must be one of those in sumatra.web.templatetags.filters:labelize_tag:style_map
                    smtrecord.add_tag("surrogate")
                    # TODO: Again, DRY with task_types._run_and_record()
                    # The relative path is passed as a string, as done in
                    # _run_and_record(), so both produce the same kind of key.
                    resolved_outroot = outroot.resolve()
                    smtrecord.output_data = [
                        DataFile(
                            str(Path(path).relative_to(resolved_outroot)),
                            config.project.data_store).generate_key()
                        for path in outputpaths
                    ]
                    config.project.save_record(smtrecord)
                    config.project.save()
        else:
            taskfiles_untouched.append(taskpath)

    have_been = "would be" if dry_run else "have been"
    will = "would" if dry_run else "will"
    if taskfiles_to_add_as_records:
        print(f"Surrogate records {have_been} added for the following tasks:")
        for taskpath in taskfiles_to_add_as_records:
            print(f" {taskpath}")
    if not keep and taskfiles_to_delete:
        print(f"\nThe following task files {will} be removed:")
        termcols = shutil.get_terminal_size().columns
        w = max(len(str(t[0])) for t in taskfiles_to_delete)
        w = min(w, termcols - 25)  # Max columns to use for the path before truncating
            # 20: max width of `reason`  |  3: spacing used in formatted string
        w = max(w, 1)  # A very narrow terminal must not yield a width < 1
        for taskpath, reason in taskfiles_to_delete:
            pathstr = str(taskpath)
            if len(pathstr) > w:
                # Keep the last w-1 characters, so that the result is exactly
                # `w` wide once the ellipsis is prepended
                pathstr = "…" + pathstr[len(pathstr) - (w - 1):]
            print(f" {pathstr:<{w}} {reason}")
    if taskfiles_untouched:
        print(f"\nThe following task files {will} be kept since no "
              "corresponding output files were found:")
        for taskpath in taskfiles_untouched:
            print(f" {taskpath}")

    if not keep and not dry_run:
        for taskpath, _ in taskfiles_to_delete:
            try:
                os.remove(taskpath)
            except OSError:
                # Best effort: this includes FileNotFoundError, which is
                # expected for task files that were already missing.
                pass
        print("Aforementioned task files have been removed.")