def run(ctx, args):
    """Run an experiment for the requested number of trials.

    The experiment name (optionally carrying an operation name) is taken
    from ``args``, the experiment's configuration file is looked up in the
    project configuration, and an ``oplib.Operation`` is built from it.
    Unless ``args.yes`` is set, the user is asked to confirm before the
    operation is run ``args.trials`` times.

    Args:
        ctx: Command context. Not used by this function.
        args: Parsed command arguments. Reads ``yes`` and ``trials`` here;
            the remaining fields are consumed by the helper functions.
    """
    # Strip potential operation from experiment name
    experiment, op_name = _strip_op_name_from_experiment(args)

    # Safe load of experiment file path. A KeyError here means the project
    # configuration has no "experiments" section at all.
    try:
        experiment_config_file = \
            config.get_project_config()["experiments"].get(experiment)
    except KeyError:
        cli.error("No experiments found. "
                  "Are you sure you're in a Tracker project?")

    # Load configuration file
    # NOTE(review): `.get()` yields None for an unknown experiment name, so
    # `config.load` may receive None here -- confirm it handles that.
    experiment_config = config.load(experiment_config_file)

    # Create operation object
    # - Here we scan through the sourcecode
    #   and extract the (hyper-)parameters
    op = oplib.Operation(
        op_name,
        _op_run_dir(args),
        _get_experiment_dict_by_name(experiment, experiment_config),
        _op_gpus(args),
        args.yes)

    # Prompt user to confirm run parameters
    if args.yes or _confirm_run(args, experiment, op):
        for n in range(args.trials):
            cli.out("Trial {}/{}".format(n + 1, args.trials))
            # Run the trial
            _run(args, op)
def run(ctx, args):
    """Run an experiment, falling back to the default operation.

    Splits the experiment name from an optional operation name, resolves
    the experiment's configuration file through the project configuration,
    builds an ``oplib.Operation`` and, once confirmed (or when ``args.yes``
    is set), runs it ``args.trials`` times.

    Args:
        ctx: Command context. Not used by this function.
        args: Parsed command arguments. Reads ``yes`` and ``trials`` here;
            the remaining fields are consumed by the helper functions.
    """
    name, operation = _strip_op_name_from_experiment(args)
    if operation is None:
        # No operation given by the user: use the module default.
        operation = DEFAULT_OP
        log.debug("Running experiment: '{}' with default operation: '{}' "
                  "as no op was provided by the user!".format(
                      name, operation))

    # A KeyError means the project configuration lacks an "experiments"
    # section entirely.
    try:
        conf_path = config.get_project_config()["experiments"].get(name)
    except KeyError:
        cli.error("No experiments found. "
                  "Are you sure you're in a Tracker project?")

    conf = config.load(conf_path)

    # Evaluate the operation inputs in the same order they are passed.
    run_dir = _op_run_dir(args)
    experiment = _op_experiment(name, conf)
    remote = _op_remote(args)
    gpus = _op_gpus(args)
    op = oplib.Operation(operation, run_dir, experiment, remote, gpus)

    # Bail out when the user declines the run parameters.
    if not (args.yes or _confirm_run(args, name, op)):
        return

    for index in range(args.trials):
        trial_no = index + 1
        cli.out("Trial {}/{}".format(trial_no, args.trials))
        _run(args, op)
def run_dir_for_id(run_id):
    """Return the path resolved by ``_path_for_id`` for ``run_id``.

    When ``_path_for_id`` raises ``NoSuchRun``, a hint is printed and the
    failure is reported through ``cli.error`` with ``errno.ENOENT``.
    """
    try:
        run_dir = _path_for_id(run_id)
    except NoSuchRun:
        hint = ("The trial with id: '{}' was not found\n"
                "Show trials by running 'tracker experiment NAME "
                "--list_trials'.")
        cli.out(hint.format(run_id))
        cli.error("No such directory", errno.ENOENT)
    else:
        return run_dir
def _op_run_dir(args):
    """Return the absolute form of ``args.run_dir``, or None when unset.

    Unless the ``NO_WARN_RUNDIR`` environment variable equals ``"1"``, a
    notice about the explicit run directory is printed via ``cli.out``.
    """
    if not args.run_dir:
        return None

    resolved = os.path.abspath(args.run_dir)
    warn = os.getenv("NO_WARN_RUNDIR") != "1"
    if warn:
        cli.out("Run directory is '{}' (results will not be "
                "visible to Tracker)".format(resolved))
    return resolved
def remote_op(op, prompt, default_resp, args):
    """Call ``op`` after an optional confirmation prompt.

    With ``args.yes`` set, ``op`` is called straight away. Otherwise
    ``prompt`` is printed and ``op`` is only called when the user confirms
    (``default_resp`` is passed on to ``cli.confirm``).
    ``OperationNotSupported`` and ``OperationError`` raised by ``op`` are
    handed to ``cli.error``.
    """
    if not args.yes:
        cli.out(prompt)
        if not cli.confirm("Continue?", default_resp):
            return

    try:
        op()
    except (OperationNotSupported, OperationError) as err:
        cli.error(err)
def _maybe_apply_default_runs(args):
    """Validate that exactly two runs were given to the `diff` command.

    Returns None when ``args.runs`` holds two entries. With no runs a
    ``NotImplementedError`` is raised; with one run or more than two, a
    usage hint is printed and ``cli.error()`` is called.
    """
    count = len(args.runs)
    if count == 2:
        return

    if count == 0:
        # TODO: default runs are not implemented; the original sketch was
        # `args.run = ("2", "1")`.
        raise NotImplementedError

    if count == 1:
        hint = (
            "The `diff` command requires two runs.\n"
            "Try specifying a second run or 'tracker diff --help' "
            "for more information.")
    else:
        hint = (
            "The `diff` command cannot compare more than two runs.\n"
            "Try specifying just two runs or 'tracker diff --help' "
            "for more information.")
    cli.out(hint)
    cli.error()
def _run_remote(op, args):
    """Start the operation on the remote selected by ``args``.

    The remote is obtained from ``remotelib.remote_for_args`` and its
    ``run_op`` is called with the keyword arguments built by ``_run_kw``.
    Each known remote failure is passed to its dedicated handler. When the
    run starts and ``args.background`` is set, a hint on how to watch the
    run is printed.

    Note: ``op`` is accepted but not used in this function.
    """
    remote = remotelib.remote_for_args(args)
    run_kwargs = _run_kw(args)
    try:
        run_id = remote.run_op(**run_kwargs)
    except remotelib.RunFailed as e:
        _handle_remote_run_failed(e, remote)
    except remotelib.RemoteProcessError as e:
        _handle_remote_process_error(e)
    except remotelib.RemoteProcessDetached as e:
        _handle_remote_process_detached(e, args.remote)
    except remotelib.OperationError as e:
        _handle_remote_op_error(e, remote)
    except remotelib.OperationNotSupported:
        cli.error("{} does not support this operation".format(remote.name))
    else:
        if not args.background:
            return
        watch_hint = (
            "{run_id} is running remotely on {remote}\n"
            "To watch use 'tracker watch {run_id} -r {remote}'")
        cli.out(watch_hint.format(run_id=run_id[:8], remote=args.remote))
def _tail(run):
    """Follow a run's output file on stdout while its process is alive.

    Prints a "Watching run" notice to stderr unless the
    ``NO_WATCHING_MSG`` environment variable equals ``"1"``. When the run
    has no pid, its output is printed via ``_print_output`` and the
    function returns. Otherwise the file at ``run.tracker_path("output")``
    is polled line by line until the process stops; whatever is still
    unread at that point is flushed to stdout, and the file is closed.

    Args:
        run: Run object providing ``id``, ``pid`` and ``tracker_path``.
    """
    if os.getenv("NO_WATCHING_MSG") != "1":
        cli.out("Watching run %s (pid: %s)" % (run.id, run.pid), err=True)
    if run.pid is None:
        _print_output(run)
        return
    # NOTE(review): psutil.Process raises NoSuchProcess when the pid is
    # already gone -- confirm whether callers expect that to propagate.
    proc = psutil.Process(run.pid)
    output_path = run.tracker_path("output")
    f = None
    try:
        while proc.is_running():
            # The output file may not exist yet; keep retrying until it does.
            f = f or _try_open(output_path)
            if not f:
                time.sleep(1.0)
                continue
            line = f.readline()
            if not line:
                time.sleep(0.1)
                continue
            sys.stdout.write(line)
            sys.stdout.flush()

        # The process can write its final lines and exit between two polls;
        # drain what is left so the tail of the output is not lost.
        f = f or _try_open(output_path)
        if f:
            while True:
                line = f.readline()
                if not line:
                    break
                sys.stdout.write(line)
            sys.stdout.flush()
    finally:
        # Always release the file handle, also on KeyboardInterrupt.
        if f:
            f.close()
def _handle_remote_process_detached(e, remote):
    """Tell the user how to re-attach to a remote run that is still going.

    Args:
        e: Exception whose first argument is the run id.
        remote: Value shown after ``-r`` in the suggested watch command.
    """
    short_id = e.args[0][:8]
    template = (
        "\nDetached from remote run {run_id} (still running)\n"
        "To re-attach use 'tracker watch {run_id} -r {remote}'")
    cli.out(template.format(run_id=short_id, remote=remote))
def _handle_remote_run_failed(e, remote):
    """Point the user at the failed remote run's output, then call cli.error.

    The run id is the base name of ``e.remote_run_dir``; only its first
    eight characters are shown in the suggested command.
    """
    short_id = os.path.basename(e.remote_run_dir)[:8]
    hint = ("Try 'tracker runs info %s -O -r %s' to view its output."
            % (short_id, remote.name))
    cli.out(hint, err=True)
    cli.error()
def _print_run_status(run):
    """Print the run's short id and status to stderr via ``cli.out``."""
    status_line = ("Run %s stopped with a status of '%s'"
                   % (run.short_id, run.status))
    cli.out(status_line, err=True)
def _stopped_msg(run):
    """Print a "Stopped watching" notice for ``run``.

    When the run has a pid and that process is still running, the pid is
    appended to the message. A pid that no longer exists counts as not
    running instead of raising ``psutil.NoSuchProcess``.
    """
    msg = "\nStopped watching %s" % run.short_id

    still_running = False
    if run.pid:
        try:
            still_running = psutil.Process(run.pid).is_running()
        except psutil.NoSuchProcess:
            # The process already exited, so there is nothing to report.
            still_running = False

    if still_running:
        msg += " (%s still running)" % run.pid
    cli.out(msg)