def check_nvidia_smi(exit_if_fail=False, debug=False):
    """
    Verify that nvidia-smi runs quickly enough for GPU utilization sampling.

    Times a single ``nvidia-smi`` invocation; if it takes longer than
    MAX_NVIDIA_SMI_TIME_SEC, emit instructions for enabling the
    nvidia-persistenced service.

    :param exit_if_fail: if True, log an error and exit(1) on a slow run
        instead of just warning.
    :param debug: forwarded to run_nvidia_smi.
    """
    before_sec = time.time()
    # $ nvidia-smi
    smi_output = run_nvidia_smi(debug=debug)
    nvidia_smi_sec = time.time() - before_sec
    if nvidia_smi_sec <= MAX_NVIDIA_SMI_TIME_SEC:
        # Fast enough; nothing to report.
        return
    # $ sudo service nvidia-persistenced start
    errmsg = textwrap.dedent("""
    RL-Scope WARNING: nvidia-smi takes a long time to run on your system.
    In particular, it took {sec} sec to run nvidia-smi (we would prefer < {limit_sec}).
    This will interfere with sampling GPU utilization.
    You can fix this by running the following command:

      # Start systemd nvidia-persistenced service (if it's not already running).
      $ sudo nvidia-persistenced --persistence-mode

    For more details see:
    https://devtalk.nvidia.com/default/topic/1011192/nvidia-smi-is-slow-on-ubuntu-16-04-/
    """).format(
        sec=nvidia_smi_sec,
        limit_sec=MAX_NVIDIA_SMI_TIME_SEC,
    )
    if exit_if_fail:
        logger.error(errmsg)
        sys.exit(1)
    else:
        logger.warning(errmsg)
def register_wrap_module(wrap_module, unwrap_module):
    """
    Register a (wrap, unwrap) callable pair for library-call interception.

    :param wrap_module: callable that installs the interception wrappers.
    :param unwrap_module: callable that removes them.

    The pair is appended to WRAPPED_MODULES. If the libraries have already
    been wrapped globally (_LIBS_WRAPPED), wrap this module immediately and
    warn that calls made before this point were not recorded.

    Fix: corrected "happpened" -> "happened" typo in the warning message.
    """
    wrapped_module = WrappedModule(wrap_module, unwrap_module)
    WRAPPED_MODULES.append(wrapped_module)
    if _LIBS_WRAPPED:
        logger.warning(
            f"Registering wrapped module late for {wrap_module.__name__}; you will miss recording calls that happened before this point"
        )
        # Late registration: wrap right away since the bulk wrap already ran.
        wrapped_module.wrap()
def _add_rlscope_args(args):
    """
    Disable RL-Scope profiling when the rlscope native API is not in use.

    If rlscope_api is not loaded and the user has not already set
    rlscope_disable, set rlscope_disable=True on args and log a hint
    explaining how to enable profiling (via the rls-prof prefix).
    """
    api_in_use = rlscope_api.is_used()
    user_disabled = get_arg(args, 'rlscope_disable')
    if api_in_use or user_disabled:
        return
    hint = textwrap.dedent("""\
    Skipping RL-Scope profiling; to run with RL-Scope prefix your command with:
      $ rls-prof ...
        --------
    """).rstrip()
    logger.warning(hint)
    set_arg(args, 'rlscope_disable', True)
def pdf2svg(path, output=None, can_skip=False):
    """
    Convert a PDF file to SVG via the `pdf2svg` command-line tool.

    :param path: input .pdf file.
    :param output: output .svg path; defaults to `path` with .pdf replaced by .svg.
    :param can_skip: if True and the pdf2svg binary is missing, warn and
        return instead of raising RuntimeError.
    :raises RuntimeError: pdf2svg is not installed and can_skip is False.
    """
    if output is None:
        output = re.sub(r'\.pdf$', '.svg', path)
        # If this fails, then "path" doesn't end with .pdf.
        assert path != output
    if shutil.which('pdf2svg') is None:
        if not can_skip:
            raise RuntimeError(f"pdf2svg shell command not found for: \"pdf2svg {path} {output}\". Install with \"sudo apt install pdf2svg\"")
        logger.warning(f"pdf2svg shell command not found; SKIP: pdf2svg {path} {output}")
        return
    cmdline = [
        'pdf2svg',
        # input
        path,
        output,
    ]
    subprocess.check_call(cmdline)
def dump_plot_index_py(self):
    """
    Generate the plot-index module at self.plot_index_path and copy the
    rlscope_plot_index.py helper script into self.out_dir.

    Skips generation when the index file already exists and self.replace
    is False. When self.dry_run is True, nothing on disk is touched
    (contents are still built so --debug can log them).

    Fix: previously the index file was opened in 'w' mode even during a
    dry run, truncating any existing file; the open() is now guarded by
    `not self.dry_run` along with the write.
    """
    cmd = sys.argv
    if not self.dry_run:
        src = _j(py_config.ROOT, 'rlscope/scripts/rlscope_plot_index.py')
        dst = _j(self.out_dir, 'rlscope_plot_index.py')
        logger.info("cp {src} -> {dst}".format(src=src, dst=dst))
        os.makedirs(_d(dst), exist_ok=True)
        shutil.copyfile(src, dst)
        os.makedirs(_d(self.plot_index_path), exist_ok=True)
    if _e(self.plot_index_path) and not self.replace:
        logger.warning(
            "{path} exists; skipping".format(path=self.plot_index_path))
        return
    contents = textwrap.dedent("""\
    #!/usr/bin/env python3

    ### GENERATED FILE; do NOT modify!
    ### generated using:
    ### CMD:
    ###   {cmd}
    ### PWD:
    ###   {pwd}

    DIRECTORY = "{dir}"
    INDEX = \\
    {index}
    """).format(
        dir=os.path.realpath(self.directory),
        index=textwrap.indent(pprint.pformat(self.index), prefix=" "),
        cmd=" ".join(cmd),
        pwd=os.getcwd(),
    )
    if self.debug:
        logger.info("> Generated file: {path}".format(
            path=self.plot_index_path))
        logger.info(contents)
    if not self.dry_run:
        # Only touch the filesystem when actually running.
        with open(self.plot_index_path, 'w') as f:
            f.write(contents)
def pdf2png(path, output=None, can_skip=True, silent=True):
    """
    Convert the first page of a PDF file to PNG via `pdftoppm`.

    :param path: input .pdf file; silently skipped if not a PDF and can_skip.
    :param output: output .png path; defaults to `path` with .pdf replaced by .png.
    :param can_skip: if True and pdftoppm is missing, return instead of raising.
    :param silent: suppress the "command not found" warning when skipping.
    :raises RLScopeConfigurationError: pdftoppm missing and can_skip is False.
    """
    if can_skip and not is_pdf(path):
        return
    if output is None:
        output = re.sub(r'\.pdf$', '.png', path)
        # If this fails, then "path" doesn't end with .pdf.
        assert path != output
    if shutil.which('pdftoppm') is None:
        if not can_skip:
            raise RLScopeConfigurationError(f"pdftoppm shell command not found for: \"pdftoppm {path} {output}\". Install with \"sudo apt install poppler-utils\"")
        if not silent:
            logger.warning(f"pdftoppm shell command not found; SKIP: \"pdftoppm {path} {output}\"")
        return
    cmdline = [
        'pdftoppm',
        # input
        path,
        '-png',
        # first page
        '-f', '1',
        # single page pdf
        '-singlefile',
    ]
    with open(output, 'wb') as f:
        subprocess.check_call(cmdline, stdout=f)
def mode_run_sh(self):
    """
    Dispatch all queued shell commands to per-GPU worker processes and
    poll until they finish.

    Exits the process with:
      0 - all workers finished and the command queue is empty;
      1 - a worker signaled failure, a worker died with an unhandled
          exception (negative exitcode), workers finished with commands
          still queued, or the user pressed Ctrl-C.

    Shows a progressbar when self.should_show_progress is set.
    """
    # Fill queue with commands to run.
    run_commands = self.run_commands()
    for run_cmd in run_commands:
        logger.debug(f"Put: {run_cmd}")
        self.cmd_queue.put(run_cmd)
    self.start_gpu_workers()
    bar = None
    if self.should_show_progress:
        bar = progressbar.ProgressBar(max_value=len(run_commands))
    last_completed = None
    # Wait for workers to terminate
    try:
        while True:
            if self.should_show_progress:
                # Approximate progress as commands drained from the queue.
                # NOTE(review): qsize() is approximate on some platforms —
                # acceptable for a progress display.
                completed = len(run_commands) - self.cmd_queue.qsize()
                if last_completed is None or completed > last_completed:
                    bar.update(completed)
                last_completed = completed
            if self.worker_failed.is_set():
                # A worker reported a failed command; tear everything down.
                self.stop_workers()
                # ; use --skip-errors to ignore failed commands.
                if not self.skip_final_error_message:
                    logger.error("At least one command failed with non-zero exit status")
                if self.should_show_progress:
                    bar.finish()
                sys.exit(1)
            alive_workers = 0
            failed_workers = 0
            for gpu, worker in self.gpu_workers.items():
                if worker.is_alive():
                    alive_workers += 1
                    continue
                # Negative exitcode means the worker process died from a
                # signal / unhandled exception rather than exiting cleanly.
                if worker.exitcode < 0:
                    logger.error("GPU[{gpu}] worker failed with exitcode={ret} (unhandled exception)".format(
                        gpu=gpu,
                        ret=worker.exitcode,
                    ))
                    self.worker_failed.set()
                    failed_workers += 1
            if failed_workers > 0:
                self.stop_workers()
                if self.should_show_progress:
                    bar.finish()
                sys.exit(1)
            if alive_workers == 0:
                # All workers exited; success only if the queue is drained.
                if self.cmd_queue.qsize() > 0:
                    logger.warning("GPU workers have finished with {len} remaining commands unfinished".format(
                        len=self.cmd_queue.qsize()
                    ))
                    sys.exit(1)
                logger.debug("GPU workers have finished successfully".format(
                    len=self.cmd_queue.qsize()
                ))
                if self.should_show_progress:
                    bar.finish()
                sys.exit(0)
            # Poll interval between liveness checks.
            time.sleep(2)
    except KeyboardInterrupt:
        logger.info("Saw Ctrl-C; waiting for workers to terminate")
        self.stop_workers()
        logger.warning("{len} remaining commands went unprocessed".format(len=self.cmd_queue.qsize()))
        if self.should_show_progress:
            bar.finish()
        sys.exit(1)
NOTE: We don't bother to wrap export_saved_model, simply because it doesn't expose any convenient hooks... but we COULD wrap it (minigo doesn't use it). """ from rlscope.profiler.rlscope_logging import logger from rlscope import py_config SKIP_MODULE = False if py_config.is_running_unit_tests(): try: import tensorflow from tensorflow.python.training.training import CheckpointSaverListener except ImportError: SKIP_MODULE = True logger.warning( "SKIP import of {path} during pytest; TensorFlow not installed". format(path=__file__, )) # Python cannot "return" from a module, hence the giant if statement. if not SKIP_MODULE: import tensorflow as tf import rlscope from tensorflow.python.training.training import CheckpointSaverListener """ Wrap tf.estimator.Estimator. """ old_Estimator = None def setup_wrap_Estimator():
def _sel_all(selector, sel_order, level, md, subtree, skip_missing_fields=False, debug=False):
    """
    Recursively walk an index subtree in the field order given by
    sel_order, yielding (metadata, entry) pairs for every selected leaf.

    The subtree at each level is keyed like:

        subtree = {
            sel_order[level]: {
                <sel_order[level] value>: ...
            }
        }

    `selector` decides which child subtrees to visit at each level (via
    the _sel helper), and `md` accumulates the field=value choices made
    along the path; a copy of it is yielded at each leaf.

    :param selector: selection spec forwarded to _sel at every level.
    :param sel_order: ordered list of field names to descend through.
    :param level: index into sel_order for the current depth.
    :param md: dict of field->value choices made so far (mutated in place,
        copied on yield).
    :param subtree: current subtree of the index.
    :param skip_missing_fields: if True, silently advance past fields that
        are absent from the subtree instead of raising.
    :param debug: log the current level.
    :return: generator of (metadata dict, leaf subtree) pairs.
    :raises RuntimeError: the current field is absent and
        skip_missing_fields is False.
    """
    if debug:
        logger.debug(f"level = {level}")
    # Advance past missing fields (when allowed) until we either reach a
    # leaf (level == len(sel_order)) or find a field present in subtree.
    while True:
        if level == len(sel_order):
            # Leaf: yield a snapshot of the accumulated metadata.
            yield dict(md), subtree
            return
        field = sel_order[level]
        if field not in subtree and skip_missing_fields:
            # Subtree is missing field, but there's only one choice of field-value to use.
            logger.warning("Skipping field={field} since it is missing".format(
                field=field))
            level += 1
        elif field in subtree:
            break
        else:
            raise RuntimeError(
                "Didn't find field={field} in selector; options are {fields}".
                format(
                    field=field,
                    fields=sorted(subtree.keys()),
                ))
    # Recurse into each selected value of the current field.
    for value, next_subtree in _sel(selector, subtree[field], field, skip_missing_fields=skip_missing_fields, debug=debug):
        md[field] = value
        for md, entry in _sel_all(selector, sel_order, level + 1, md, next_subtree, skip_missing_fields=skip_missing_fields, debug=debug):
            yield md, entry
def main():
    """
    Entry point for ``rls-run``: parse RL-Scope-specific flags, then
    forward all remaining arguments to the luigi DAG runner in
    rlscope.parser.tasks.

    Exits 1 on host misconfiguration; exits 0 after printing help when
    --help is given without --task.
    """
    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)
    parser = argparse.ArgumentParser(
        description=textwrap.dedent("""\
        Process trace-files collected from running an ML script with the RL-Scope profiler.

        For task-specific help, provided task-name and --help, e.g.:
        $ rls-run --task OverlapStackedBarTask --help

        NOTE:
        - This script is a thin usage/debugging wrapper around a "luigi" DAG execution script.
          It just forwards arguments to it.
        - Any unparsed args are forward to the luigi script.
        """),
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )
    parser.add_argument('--pdb', action='store_true',
                        help="Break into pdb when an exception occurs")
    parser.add_argument('--task',
                        choices=[klass.__name__ for klass in tasks.RLSCOPE_TASKS],
                        help="Name of a runnable IMLTask defined in rlscope.parser.tasks")
    parser.add_argument('--workers',
                        type=int,
                        # DISABLE --workers for now to prevent opening to many postgres connections by accident;
                        # we parallelize internally instead
                        # e.g. ResourceOverlap with 32 worker threads, each of which opens a SQL
                        # connection.
                        # default=multiprocessing.cpu_count(),
                        default=1,
                        help="Maximum number of parallel tasks to run (luigi parameter)")
    parser.add_argument('--help', '-h', action='store_true')
    # parse_known_args: unrecognized args pass through to luigi untouched.
    args, luigi_argv = parser.parse_known_args(sys.argv)
    if args.help and not args.task:
        # Print available tasks.
        parser.print_help()
        sys.exit(0)
    if args.task is None and not args.help:
        # If they just run this:
        # $ rls-run --rlscope-directory <dir>
        # Then run all the targets.
        args.task = 'All'
    extra_argv = [
        '--module', 'rlscope.parser.tasks',
        '--local-scheduler',
        # Default log-level from luigi is DEBUG which is too noisy.
        # Make the default level INFO instead.
        '--log-level', 'INFO',
    ]
    luigi_argv.extend(extra_argv)
    if args.task:
        # Task needs to be the first argument after rls-run.
        luigi_argv.insert(1, args.task)
    if args.help:
        luigi_argv.extend(['--help'])
    if args.workers > 1:
        logger.warning("Each overlap plot uses all the cores; forcing --workers=1")
        args.workers = 1
    if args.pdb:
        logger.debug("Registering pdb breakpoint (--pdb)")
        register_pdb_breakpoint()
        # Debugger is useless when multithreaded.
        args.workers = 1
    luigi_argv.extend(['--workers', str(args.workers)])
    # logger.debug("Luigi arguments:\n{msg}".format(msg=textwrap.indent(pprint.pformat({
    # 'luigi_argv':luigi_argv,
    # 'sys.argv':sys.argv,
    # }), prefix=' ')))
    with warnings.catch_warnings():
        # I don't really take much advantage of luigi's DFS scheduler and instead run things manually.
        # Oh well.
        warnings.filterwarnings('ignore', category=UserWarning, message=r'.*without outputs has no custom complete', module=r'luigi')
        warnings.filterwarnings('ignore', category=UserWarning, message=r'Parameter.*with value "None" is not of type string', module=r'luigi')
        tasks.main(argv=luigi_argv[1:], should_exit=False)