def summary(self, params=None):
    """Return summary of benchmarks providing additional info on `params`.

    Args:
        params (list): List of parameters to provide additional info for. If empty, default list is used.

    Returns:
        dict: A summary of benchmarks.
    """
    if not params:
        params = ['exp.node_id', 'exp.node_title', 'exp.gpu_title', 'exp.gpu_id',
                  'exp.framework_title', 'exp.framework_id']
    summary_dict = {
        'num_benchmarks': len(self.__benchmarks),
        'num_failed_benchmarks': 0,
        'num_successful_benchmarks': 0
    }
    for param in params:
        summary_dict[param] = set()
    for bench in self.__benchmarks:
        if DictUtils.get(bench, 'results.time', -1) > 0:
            summary_dict['num_successful_benchmarks'] += 1
        else:
            summary_dict['num_failed_benchmarks'] += 1
        for param in params:
            summary_dict[param].add(DictUtils.get(bench, param, None))
    for param in params:
        summary_dict[param] = list(summary_dict[param])
    return summary_dict
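# A minimal usage sketch for `summary`, illustrating the shape of the returned dictionary. The
# construction of BenchData from a list of benchmark dictionaries mirrors the call used in
# `status` below; the parameter names and values here are made up for illustration.
bench_data = BenchData([
    {'exp.model': 'ResNet50', 'exp.gpu_title': 'Tesla V100', 'results.time': 112.4},
    {'exp.model': 'VGG16', 'exp.gpu_title': 'Tesla V100', 'results.time': -1},
], create_copy=False)
report = bench_data.summary(params=['exp.model', 'exp.gpu_title'])
# report['num_benchmarks'] == 2, report['num_successful_benchmarks'] == 1,
# report['num_failed_benchmarks'] == 1; report['exp.model'] is a list of unique values,
# e.g. ['ResNet50', 'VGG16'] (order not guaranteed since it comes from a set).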
def check_variable_value(self, experiment, var):
    """Check if variable contains correct value according to parameter info.

    Args:
        experiment (dict): A dictionary with experiment parameters.
        var (str): Name of a parameter.
    """
    if self.param_info is None or var not in self.param_info:
        return
    pi = self.param_info[var]
    ParamUtils.check_value(
        var,                                    # Parameter name.
        experiment[var],                        # Parameter value.
        DictUtils.get(pi, 'val_domain', None),  # Value domain constraints.
        DictUtils.get(pi, 'val_regexp', None)   # Value regexp constraints.
    )
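# A hedged, standalone sketch of the kind of check ParamUtils.check_value performs, assuming
# 'val_domain' is a list of allowed values and 'val_regexp' a regular expression. Both are
# assumptions about the parameter-info schema; only the key names come from the call above,
# and `check_value_sketch` is an illustrative helper, not the real API.
import re

def check_value_sketch(name, value, val_domain=None, val_regexp=None):
    if val_domain is not None and value not in val_domain:
        raise ValueError("Parameter '%s' has value '%s' outside of domain %s" % (name, value, val_domain))
    if val_regexp is not None and not re.match(val_regexp, str(value)):
        raise ValueError("Parameter '%s' has value '%s' not matching '%s'" % (name, value, val_regexp))

check_value_sketch('exp.dtype', 'float16', val_domain=['float32', 'float16', 'int8'])
check_value_sketch('exp.num_gpus', '8', val_regexp=r'^\d+$')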
def get_header(self):
    header = ""
    for input_col in self.input_cols:
        format_str = " %-" + str(input_col['width']) + "s"
        header = header + format_str % BenchData.Reporter.to_string(input_col['title'])
    header += " "
    output_cols_title = " " * len(header) + DictUtils.get(BenchData.Reporter.TITLES,
                                                          self.output_param, self.output_param)
    for output_col in self.output_cols:
        format_str = "%+" + str(output_col['width']) + "s "
        header = header + format_str % BenchData.Reporter.to_string(output_col['title'])
    return [output_cols_title, header]
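# A tiny standalone demo of the printf-style padding used in get_header: input column titles are
# left-aligned with "%-<width>s", output column values are right-aligned with "%+<width>s".
# The titles and widths below are made up for illustration.
row = ""
for title, width in [('Model', 12), ('Batch', 6)]:
    row += (" %-" + str(width) + "s") % title
row += " "
for title, width in [('1 GPU', 8), ('8 GPUs', 8)]:
    row += ("%+" + str(width) + "s ") % title
print(row)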
def init(self, **kwargs):
    """Initializes experimenter.

    Args:
        **kwargs (dict): Optional initialization parameters:
            - action (str): Action to perform.
            - config (str): A user-provided configuration file.
            - plan (str): A file for the generated benchmark plan.
            - no_validation (bool): If True, do not perform validation.
            - progress_file (str): A path to a progress file (if not None, enables progress reporting).
            - params (dict): User-defined parameters.
            - vars (dict): User-defined variables.
            - discard_default_config (bool): If True, do not load the standard DLBS config.
            - extensions (dict): User-provided extensions.

    User-provided parameters (`params`), variables (`vars`) and extensions (`extensions`) overwrite values
    defined in the user configuration file (`config`) if it is present. Information defined in a
    user-provided configuration file (`config`) overwrites the standard DLBS configuration.
    """
    if self.__initialized:
        raise RuntimeError("Experimenter can only be initialized once.")
    self.action = DictUtils.get(kwargs, 'action', 'run')
    self.config_file = DictUtils.get(kwargs, 'config', None)
    self.plan_file = DictUtils.get(kwargs, 'plan', None)
    self.validation = not DictUtils.get(kwargs, 'no_validation', False)
    self.__progress_file = DictUtils.get(kwargs, 'progress_file', None)
    # Get user-provided parameters and variables (e.g. from a command line).
    self.params.update(DictUtils.get(kwargs, 'params', {}))
    self.variables.update(DictUtils.get(kwargs, 'vars', {}))
    # Load the default configuration.
    if not DictUtils.get(kwargs, 'discard_default_config', False):
        logging.debug("Loading default configuration")
        _, self.config, self.param_info = ConfigurationLoader.load(
            os.path.join(os.path.dirname(__file__), 'configs'))
    # Load configurations specified on the command line.
    self.load_configuration()
    # Add extensions from the command line.
    DictUtils.ensure_exists(self.config, 'extensions', [])
    self.config['extensions'].extend(DictUtils.get(kwargs, 'extensions', []))
    # All done.
    self.__initialized = True
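# A hedged usage sketch for `init`. The `Experimenter` class name comes from the error message
# above; the no-argument constructor, the file paths and the concrete parameter names are
# assumptions used for illustration only.
experimenter = Experimenter()
experimenter.init(
    action='run',
    config='./my_config.json',                 # user config, overrides the standard DLBS configuration
    params={'exp.num_warmup_batches': 10},     # overrides values defined in `config`
    vars={'exp.model': ['resnet50', 'vgg16']},
    progress_file='/tmp/progress.json'
)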
def select_values(self, key):
    """Return unique values for the `key` across all benchmarks.

    A missing key in a benchmark is considered to be a key having None value.

    Args:
        key (str): A key to return unique values for.

    Returns:
        list: Sorted list of values.
    """
    selected = set()
    for benchmark in self.__benchmarks:
        selected.add(DictUtils.get(benchmark, key, None))
    return sorted(list(selected))
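# A minimal usage sketch for `select_values`, reusing the list-of-dictionaries construction shown
# in `status` below. Parameter names and values are made up for illustration.
bench_data = BenchData([
    {'exp.model': 'ResNet50', 'exp.num_gpus': 1},
    {'exp.model': 'ResNet50', 'exp.num_gpus': 4},
    {'exp.model': 'VGG16', 'exp.num_gpus': 1},
], create_copy=False)
bench_data.select_values('exp.model')     # ['ResNet50', 'VGG16']
bench_data.select_values('exp.num_gpus')  # [1, 4]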
@staticmethod
def status(arg):
    """Return the status of a benchmark.

    Args:
        arg: A name of a log file, a dictionary or an instance of the BenchData class.

    Returns:
        str or None: "ok" for a successful benchmark, "failure" for a failed one and None for
            other cases (such as a missing file).
    """
    if isinstance(arg, Six.string_types):
        bench_data = BenchData.parse(arg)
    elif isinstance(arg, dict):
        bench_data = BenchData([arg], create_copy=False)
    elif isinstance(arg, BenchData):
        bench_data = arg
    else:
        raise TypeError("Invalid argument type (={}). Expecting string, dict or BenchData.".format(type(arg)))
    if len(bench_data) == 1:
        return 'ok' if DictUtils.get(bench_data[0], 'results.time', -1) > 0 else 'failure'
    return None
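# Usage sketches for `status`: it accepts a log-file path, a single benchmark dictionary, or a
# one-benchmark BenchData instance. The file path below is a placeholder.
BenchData.status({'results.time': 245.3})    # 'ok'
BenchData.status({'results.time': -1})       # 'failure'
BenchData.status('/logs/resnet50_8gpu.log')  # parses the log file first, then applies the same rule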
def build_cache(self, inputs=None, output=None, output_cols=None):
    # Build descriptors for input columns.
    self.input_cols = [None] * len(inputs)
    for idx, param in enumerate(inputs):
        self.input_cols[idx] = {
            "index": idx,
            "param": param,
            "width": 0,
            "title": DictUtils.get(BenchData.Reporter.TITLES, param, param),
            "vals": sorted(self.bench_data.select_values(param))
        }
    # Build descriptors for output columns.
    self.output_param = output
    output_cols = output_cols if output_cols else sorted(self.bench_data.select_values(output))
    self.output_cols = [None] * len(output_cols)
    for idx, param_value in enumerate(output_cols):
        self.output_cols[idx] = {
            "index": idx,
            "value": param_value,
            "title": param_value,
            "width": len(BenchData.Reporter.to_string(param_value))
        }
    # Index successful benchmarks by a composite key built from input and output parameter values.
    self.cache = {}
    print("Number of benchmarks = {}".format(len(self.bench_data.benchmarks())), file=sys.stderr)
    for bench in self.bench_data.benchmarks():
        if BenchData.status(bench) != "ok":
            print("Ignoring failed benchmark: exp.id={}, exp.status={}, results.time={}, exp.model={}, "
                  "exp.replica_batch={}, exp.dtype={}, exp.num_gpus={}.".format(
                      bench.get('exp.id', 'UNKNOWN'), bench.get('exp.status', 'UNKNOWN'),
                      bench.get('results.time', -1), bench.get('exp.model', 'UNKNOWN'),
                      bench.get('exp.replica_batch', 'UNKNOWN'), bench.get('exp.dtype', 'UNKNOWN'),
                      bench.get('exp.num_gpus', -1)), file=sys.stderr)
            continue
        # The 'bench_key' is a composite benchmark ID that includes values of input and output variables.
        # For instance, ['VGG16', 128, 2] may mean [ModelTitle, ReplicaBatch, NumGPUs].
        bench_key = []
        # Build the initial version of the key taking into account input parameters.
        for input_col in self.input_cols:
            # The param_value is the value of an input parameter, for instance, the number of GPUs.
            param_value = DictUtils.get(bench, input_col['param'], None)
            if not param_value:
                bench_key = []
                break
            bench_key.append(str(param_value))
        if bench_key:
            output_val = DictUtils.get(bench, self.output_param, None)
            if output_val:
                bench_key = '.'.join(bench_key + [str(output_val)])
                if bench_key not in self.cache:
                    self.cache[bench_key] = bench
                else:
                    raise ValueError("Duplicate benchmark with key = {}".format(bench_key))
        else:
            # Benchmarks with missing input parameter values are silently skipped.
            pass
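# A small standalone illustration of the composite key scheme used by the cache above: the
# stringified input parameter values and the output parameter value are joined with '.' in order.
# `make_bench_key` is an illustrative helper (not part of the Reporter class) and the parameter
# names and values are examples only.
def make_bench_key(bench, input_params, output_param):
    """Mirror of the key construction in build_cache, for illustration."""
    parts = [str(bench.get(p)) for p in input_params if bench.get(p)]
    if len(parts) == len(input_params) and bench.get(output_param):
        return '.'.join(parts + [str(bench.get(output_param))])
    return None

example = {'exp.model_title': 'VGG16', 'exp.replica_batch': 128, 'exp.num_gpus': 2}
assert make_bench_key(example, ['exp.model_title', 'exp.replica_batch'], 'exp.num_gpus') == 'VGG16.128.2'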
@staticmethod
def run(plan, progress_file=None):
    """Runs experiments in `plan` one at a time.

    In the newest versions of this class the `plan` array must contain experiments with computed variables.

    Args:
        plan (list): List of benchmarks to perform (list of dictionaries).
        progress_file (str): A file for a progress reporter. If None, no progress will be reported.
    """
    num_experiments = len(plan)
    # See if the resource monitor needs to be run. The assumption is that if it is enabled for the
    # first experiment, it is enabled for all others.
    resource_monitor = None
    if num_experiments > 0 and DictUtils.get(plan[0], 'monitor.frequency', 0) > 0:
        if not os.path.isdir(plan[0]['monitor.pid_folder']):
            os.makedirs(plan[0]['monitor.pid_folder'])
        resource_monitor = ResourceMonitor(plan[0]['monitor.launcher'], plan[0]['monitor.pid_folder'],
                                           plan[0]['monitor.frequency'], plan[0]['monitor.timeseries'])
        # The file must be created beforehand - this is required for docker to keep correct access rights.
        resource_monitor.empty_pid_file()
        resource_monitor.run()
    # Count active experiments; this is used for reporting progress to a user.
    num_active_experiments = 0
    for experiment in plan:
        if DictUtils.get(experiment, 'exp.status', '') not in ['disabled', 'inactive']:
            num_active_experiments += 1
    progress_tracker = ProgressTracker(num_experiments, num_active_experiments, progress_file)

    # Set a handler for the SIGUSR1 signal. Users can send this signal to this script to gracefully
    # terminate the benchmarking process.
    print("--------------------------------------------------------------")
    print("Experimenter pid %d. Run this to gracefully terminate me:" % os.getpid())
    print("\tkill -USR1 %d" % os.getpid())
    print("I will terminate myself as soon as the current benchmark finishes.")
    print("--------------------------------------------------------------")
    sys.stdout.flush()
    Launcher.must_exit = False

    def _sigusr1_handler(signum, frame):
        Launcher.must_exit = True
    signal.signal(signal.SIGUSR1, _sigusr1_handler)

    for idx in range(num_experiments):
        if Launcher.must_exit:
            logging.warning("The SIGUSR1 signal has been caught, gracefully shutting down the benchmarking "
                            "process on experiment %d (out of %d)", idx, num_experiments)
            break
        experiment = plan[idx]
        # Is the experiment disabled?
        if DictUtils.get(experiment, 'exp.status', '') in ('disabled', 'inactive'):
            logging.info("Will not run benchmark, reason: exp.status='%s'", experiment['exp.status'])
            progress_tracker.report(experiment['exp.log_file'], exec_status='inactive')
            continue
        # If the experiment has been run before, check whether it needs to be re-run.
        if DictUtils.get(experiment, 'exp.log_file', None) is not None:
            if os.path.isfile(experiment['exp.log_file']):
                bench_status = None
                no_rerun_msg = None
                rerun_condition = DictUtils.get(experiment, 'exp.rerun', 'never')
                if rerun_condition == 'never':
                    no_rerun_msg = "Will not run benchmark, reason: log file exists, exp.rerun='never'"
                elif rerun_condition == 'onfail':
                    bench_status = BenchData.status(experiment['exp.log_file'])
                    if bench_status == 'ok':
                        no_rerun_msg = "Will not run benchmark, reason: log file exists, exp.status='ok', " \
                                       "exp.rerun='onfail'"
                if no_rerun_msg is not None:
                    logging.info(no_rerun_msg)
                    progress_tracker.report(experiment['exp.log_file'], exec_status='skipped',
                                            bench_status=bench_status)
                    continue
        # Track current progress.
        progress_tracker.report_active(DictUtils.get(experiment, 'exp.log_file', '<none>'))
        # Get the script that runs the experiment for this framework. If no 'exp.framework_family'
        # is found, fall back to 'exp.framework'.
        framework_key = 'exp.framework_family'
        if framework_key not in experiment:
            framework_key = 'exp.framework'
        command = [experiment['%s.launcher' % experiment[framework_key]]]
        # Do we need to restrict the arguments passed to the launching process?
        launcher_args_key = '%s.launcher_args' % experiment[framework_key]
        if launcher_args_key in experiment:
            launcher_args = set(experiment[launcher_args_key].split(' '))
            logging.debug('Only these arguments will be passed to the launching process (%s): %s',
                          command[0], str(launcher_args))
        else:
            launcher_args = None
        for param, param_val in experiment.items():
            if launcher_args is not None and param not in launcher_args:
                continue
            if isinstance(param_val, list):
                raise ValueError("Parameter value must not be a list at this point (%s=%s)" % (param, str(param_val)))
            if not isinstance(param_val, bool):
                command.extend(['--%s' % param.replace('.', '_'), ParamUtils.to_string(param_val)])
            else:
                command.extend(['--%s' % param.replace('.', '_'), 'true' if param_val else 'false'])
        # Prepare environment variables.
        env_vars = copy.deepcopy(os.environ)
        env_vars.update(DictUtils.filter_by_key_prefix(experiment, 'runtime.env.', remove_prefix=True))
        # Run the experiment in the background and wait for it to complete.
        worker = Worker(command, env_vars, experiment)
        worker.work(resource_monitor)
        # Report progress.
        progress_tracker.report(experiment['exp.log_file'], exec_status='completed')
        if progress_tracker.num_completed_benchmarks() % 10 == 0:
            print("Done %d benchmarks out of %d" %
                  (progress_tracker.num_completed_benchmarks(), num_active_experiments))
            sys.stdout.flush()
    # All benchmarks have been conducted.
    if resource_monitor is not None:
        resource_monitor.stop()
    progress_tracker.report_all_completed()
    progress_tracker.print_summary()
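# A small companion sketch: how another process could ask a running experimenter to terminate
# gracefully, mirroring the "kill -USR1 <pid>" hint printed above. `experimenter_pid` and the
# helper name are placeholders for illustration.
import os
import signal

def request_graceful_stop(experimenter_pid):
    """Send SIGUSR1 so the launcher exits after the current benchmark finishes."""
    os.kill(experimenter_pid, signal.SIGUSR1)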