def register_torch():
    """Register the torch wrap/unwrap hooks when PyTorch is importable.

    If importing ``torch`` raises ImportError nothing is registered; a debug
    message is emitted only when ``py_config.DEBUG_WRAP_CLIB`` is set.
    """
    try:
        # Imported purely to probe availability; the module object is unused.
        import torch  # noqa: F401
        register_wrap_module(wrap_torch, unwrap_torch)
    except ImportError:
        if not py_config.DEBUG_WRAP_CLIB:
            return
        logger.debug("torch is NOT installed")
def wrap_func(FuncWrapperKlass, module, name, wrapper_args=None, func_regex=None, ignore_func_regex="^_", should_wrap=None):
    """Replace ``module.<name>`` with ``FuncWrapperKlass(func, *wrapper_args)``.

    :param FuncWrapperKlass: Callable used to build the wrapper; invoked as
        ``FuncWrapperKlass(func, *wrapper_args)``.
    :param module: Object whose attribute ``name`` is looked up and replaced.
    :param name: Attribute name to wrap.
    :param wrapper_args: Extra positional arguments for the wrapper
        (defaults to ``_EMPTY_ARGS``).
    :param func_regex: If given, only names matching this regex are wrapped.
    :param ignore_func_regex: Names matching this regex are never wrapped
        (default: names starting with an underscore). ``None`` disables the check.
    :param should_wrap: Optional predicate ``should_wrap(name, func)``; a falsy
        result prevents wrapping.
    :return: True if the attribute was replaced, False if it was skipped.
    """
    if wrapper_args is None:
        wrapper_args = _EMPTY_ARGS

    if ignore_func_regex is not None and re.search(ignore_func_regex, name):
        if py_config.DEBUG_WRAP_CLIB:
            logger.info(" Skip func={name}".format(name=name))
        return False

    func = getattr(module, name)

    # Skip attributes that are already wrapped, and plain (non-callable) data.
    if type(func) is FuncWrapperKlass or not callable(func):
        return False

    if func_regex is not None and not re.search(func_regex, name):
        return False

    if should_wrap is not None and not should_wrap(name, func):
        return False

    # Classes and modules can be callable/attribute-like, but are not functions.
    if inspect.isclass(func) or inspect.ismodule(func):
        if py_config.DEBUG:
            logger.debug(
                "Cannot wrap {module}.{name} since it's not a function: {value}".format(
                    module=module.__name__,
                    name=name,
                    value=func,
                ))
        return False

    func_wrapper = FuncWrapperKlass(func, *wrapper_args)
    setattr(module, name, func_wrapper)
    return True
def write(self, f, txt, lstrip=True):
    """Dedent ``txt`` and write it to the file-like object ``f``.

    :param f: Object with a ``write`` method; its ``name`` attribute (when
        present) is used in the debug message.
    :param txt: Text to write; common leading indentation is removed with
        ``textwrap.dedent``.
    :param lstrip: When true, leading whitespace (including blank lines) is
        stripped after dedenting.
    """
    txt_formatted = textwrap.dedent(txt)
    if lstrip:
        txt_formatted = txt_formatted.lstrip()
    if self.debug:
        logger.debug("Write to {f}:\n{txt}".format(
            txt=textwrap.indent(txt_formatted.rstrip(), prefix=' '),
            # Fall back to the object itself for streams without a name
            # (e.g. io.StringIO).
            f=getattr(f, 'name', f),
        ))
    f.write(txt_formatted)
def register_detected_libs():
    """Register wrap/unwrap hooks for every supported library that imports.

    Probes, in order: atari_py, tensorflow (v2 or v1 hooks chosen via
    ``is_tensorflow_v2()``), pybullet, and finally torch through
    ``register_torch()``. A library that fails to import is skipped.
    """
    debug_enabled = py_config.DEBUG_WRAP_CLIB

    try:
        import atari_py  # noqa: F401
        register_wrap_module(wrap_atari, unwrap_atari)
    except ImportError:
        pass

    try:
        import tensorflow  # noqa: F401
        if is_tensorflow_v2():
            if debug_enabled:
                logger.debug("Detected TensorFlow v2")
            register_wrap_module(wrap_tensorflow_v2, unwrap_tensorflow_v2)
        else:
            if debug_enabled:
                logger.debug("Detected TensorFlow v1")
            register_wrap_module(wrap_tensorflow_v1, unwrap_tensorflow_v1)
    except ImportError:
        if debug_enabled:
            logger.debug("TensorFlow is NOT installed")

    try:
        import pybullet  # noqa: F401
        register_wrap_module(wrap_pybullet, unwrap_pybullet)
    except ImportError:
        if debug_enabled:
            logger.debug("PyBullet is NOT installed")

    # NOTE: we delay this to avoid messing up torch.jit.script until we support it better.
    register_torch()
def _gpu_worker(self, gpu, *args, **kwargs): try: self.gpu_worker(gpu, *args, **kwargs) except KeyboardInterrupt: logger.debug(f"GPU[{gpu}] worker saw Ctrl-C; exiting early") return except Exception as e: logger.error(textwrap.dedent("""\ BUG: GPU[{gpu}] worker failed with unhandled exception: {error} """).format( gpu=gpu, error=textwrap.indent(traceback.format_exc(), prefix=' '), ).rstrip()) sys.exit(1)
def gpu_worker(self, gpu):
    """Drain ``self.cmd_queue``, running each command on ``gpu``.

    The loop ends when the queue yields nothing for one second, or when
    ``self.should_stop`` / ``self.worker_failed`` is set after a command.
    Each command is attempted ``self.retry`` times (once if ``self.retry`` is
    None), stopping early on a dry run, a ``None`` process, or a zero return
    code.
    """
    logger.debug(
        f"Start GPU[{gpu}] worker; queue contains {self.cmd_queue.qsize()} items")

    command_banner = textwrap.dedent("""\
    GPU[{gpu}] worker running command (queue size = {len}):
    > CMD:
      logfile={logfile}
      $ {cmd}
    """)

    while True:
        try:
            logger.debug(f"Get: GPU[{gpu}] worker...")
            # A blocking get() without a timeout hangs forever, so poll with
            # a timeout and treat an empty queue as "no more work".
            run_cmd = self.cmd_queue.get(timeout=1)
        except queue.Empty:
            logger.debug(f"No more items in queue; exiting GPU[{gpu}] worker")
            break

        logfile = run_cmd.logfile
        logger.debug(command_banner.format(
            gpu=gpu,
            len=self.cmd_queue.qsize(),
            logfile=logfile,
            cmd=' '.join(run_cmd.cmd),
        ).rstrip())

        retries = 1 if self.retry is None else self.retry
        attempt = 1
        while attempt <= retries:
            proc = self.run_cmd(
                gpu, run_cmd,
                tee_output=self.tee or self.should_show_output,
                tee_prefix=f"GPU[{gpu}] :: ",
                attempt=attempt)
            if self.dry_run or proc is None or proc.returncode == 0:
                break
            attempt += 1

        if self.should_stop.is_set() or self.worker_failed.is_set():
            logger.debug(f"Got exit signal; exiting GPU[{gpu}] worker")
            break
def autodoc_skip_member_handler(app, what, name, obj, skip, options):
    """Sphinx ``autodoc-skip-member`` handler that hides test symbols.

    :return: True (skip) when ``name`` starts with ``test_`` or ``Test`` or
        with ``rlscope_plot_index.py``; otherwise None so Sphinx applies its
        default decision.
    """
    if re.search(r'^test_|^Test|^rlscope_plot_index\.py', name):
        if DEBUG_SPHINX:
            logger.debug("SKIP: {name} {msg}".format(
                name=name,
                msg=pprint_msg(locals())))
        return True

    if DEBUG_SPHINX:
        logger.debug("DOCS: {name} {msg}".format(
            name=name,
            msg=pprint_msg(locals())))

    # Use "default" handling behaviour of whether to skip docs for this symbol.
    # NOTE: we don't return False since it results in weird symbols like
    # __dict__ getting documented.
    return None
def record_event(category, start_us, duration_us, name):
    """Forward one event to the native ``rlscope_record_event`` call.

    :param category: Event category; converted with ``_as_c_string``.
    :param start_us: Start time; converted with ``int`` and passed as ``c_int64``.
    :param duration_us: Duration; converted with ``int`` and passed as
        ``c_int64``. A negative value is only logged (in debug mode), not rejected.
    :param name: Event name; converted with ``_as_c_string``.
    :return: The status code returned by the native call (``TF_OK`` on success).
    :raises RLScopeLibraryError: If the native call returns anything but ``TF_OK``.
    """
    if py_config.DEBUG and py_config.DEBUG_RLSCOPE_LIB_CALLS:
        logger.info(_log_api_call_msg('record_event', category, start_us, duration_us, name))

    if py_config.DEBUG and duration_us < 0:
        logger.debug(
            "BUG: recorded event with negative duration: Event(category={category}, name={name}, start_us={start_us}, dur_us={duration_us})".format(
                category=category,
                start_us=start_us,
                duration_us=duration_us,
                name=name,
            ))

    ret = _so.rlscope_record_event(
        _as_c_string(category),
        c_int64(int(start_us)),
        c_int64(int(duration_us)),
        _as_c_string(name),
    )
    if ret != TF_OK:
        raise RLScopeLibraryError(ret)
    return ret
def main():
    """Command-line entry point: run, append, or batch-run experiment commands.

    Parses the options that precede the wrapped command (split off by
    ``gather_argv``), validates the mode flags (--run / --append / --run-sh),
    resolves the GPU list, and hands everything to ``RunExpr.run_program()``
    (optionally under pdb via ``run_with_pdb``).

    NOTE(review): ``error(...)`` is assumed not to return (it is used like
    ``parser.error``) -- confirm against its definition.
    """
    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description=textwrap.dedent(__doc__.lstrip().rstrip()),
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--run", action='store_true',
                        help=textwrap.dedent("""\
                        Run the command as-is.
                        """))
    parser.add_argument("--append", action='store_true',
                        help=textwrap.dedent("""\
                        Append the command to --sh
                        """))
    parser.add_argument("--sh",
                        help=textwrap.dedent("""\
                        Shell file to append commands to (see --append).
                        """))
    parser.add_argument('--run-sh', action='store_true',
                        help=textwrap.dedent("""\
                        Run all the commands in --sh on the available --gpus
                        """))
    parser.add_argument('--rlscope-directory',
                        help=textwrap.dedent("""\
                        The output directory of the command being run.
                        This is where logfile.out will be output.
                        """))
    parser.add_argument("--verbosity",
                        choices=['progress', 'commands', 'output'],
                        default='progress',
                        help=textwrap.dedent("""\
                        Output information about running commands.
                        --verbosity progress (Default)
                            Only show high-level progress bar information.

                        --verbosity commands
                            Show the command-line of commands that are being run.

                        --verbosity output
                            Show the output of each analysis (not configuration) command on sys.stdout.
                            NOTE: This may cause interleaving of lines.
                        """))
    parser.add_argument('--line-numbers', action='store_true',
                        help=textwrap.dedent("""\
                        Show line numbers and timestamps in RL-Scope logging messages.
                        """))
    parser.add_argument('--debug', action='store_true',
                        help=textwrap.dedent("""\
                        Debug
                        """))
    parser.add_argument('--skip-final-error-message', action='store_true',
                        help=textwrap.dedent("""\
                        Skip error message printed at the end if at least one command fails.
                        """))
    parser.add_argument("--retry", type=int,
                        help=textwrap.dedent("""\
                        If a command fails, retry it up to --retry times.
                        Default: don't retry.
                        """))
    parser.add_argument("--tee", action='store_true',
                        help=textwrap.dedent("""\
                        (debug)
                        tee output of parallel processes to stdout (prefix output with worker name)
                        """))
    parser.add_argument("--pdb", action='store_true',
                        help=textwrap.dedent("""\
                        Debug
                        """))
    parser.add_argument('--dry-run', action='store_true',
                        help=textwrap.dedent("""\
                        Dry run
                        """))
    parser.add_argument('--skip-errors', action='store_true',
                        help=textwrap.dedent("""\
                        If a command fails, ignore the failure and continue running other commands.
                        """))
    parser.add_argument("--gpus", default='all',
                        help=textwrap.dedent("""\
                        # Run on the first GPU only
                        --gpus 0
                        # Run on the first 2 GPUs
                        --gpus 0,1
                        # Run on all available GPUs
                        --gpus all
                        # Don't allow running with any GPUs (CUDA_VISIBLE_DEVICES="")
                        --gpus none
                        """))

    # First pass over the full command line (our options plus the wrapped
    # command) so options such as --sh / --rlscope-directory can be inspected.
    all_args, _ = parser.parse_known_args(sys.argv)
    ignore_opts = set()
    if all_args.sh is not None:
        ignore_opts.add(all_args.sh)
    run_expr_argv, cmd = gather_argv(
        sys.argv[1:],
        ignore_opts=ignore_opts)
    args = parser.parse_args(run_expr_argv)

    if args.debug:
        logger.debug({
            'run_expr_argv': run_expr_argv,
            'cmd': cmd,
        })

    rlscope_logging.setup_logger(
        debug=args.debug,
        line_numbers=args.debug or args.line_numbers or py_config.is_development_mode(),
    )

    if args.sh is None and (args.run_sh or args.append):
        error("--sh is required when either --run-sh or --append are given",
              parser=parser)

    if args.run_sh and (args.append or args.run):
        error("When --run-sh is given, you cannot provide either --append or --run",
              parser=parser)

    available_gpus = get_available_gpus()
    if args.gpus == 'all':
        gpus = sorted([gpu['device_number'] for gpu in available_gpus])
    elif args.gpus.lower() == 'none':
        # A single "no GPU" slot. This must bind ``gpus`` (not ``args.gpus``),
        # otherwise the assertion below fails on an unbound local.
        gpus = [None]
    else:
        try:
            gpus = sorted([int(gpu) for gpu in re.split(r',', args.gpus)])
        except ValueError:
            error("Failed to parse --gpus={gpus}".format(gpus=args.gpus),
                  parser=parser)

    assert len(gpus) >= 1

    if args.run or args.append:
        if len(cmd) == 0:
            error("Expected cmd to run in arguments, but none was provided",
                  parser=parser)

        if shutil.which(cmd[0]) is None:
            error("Couldn't find {exec} on PATH".format(
                exec=cmd[0]), parser=parser)

        if all_args.rlscope_directory is None:
            # No --rlscope-directory argument; just use current directory?
            args.rlscope_directory = os.getcwd()
        else:
            # "Copy" the --rlscope-directory argument from cmd.
            args.rlscope_directory = all_args.rlscope_directory

    # RunExpr receives gpus explicitly and has no use for --pdb.
    args_dict = dict(vars(args))
    args_dict.pop('gpus')
    args_dict.pop('pdb')
    obj = RunExpr(
        cmd=cmd,
        gpus=gpus,
        **args_dict,
    )

    def _run():
        obj.run_program()

    run_with_pdb(args, _run)
def stop_workers(self):
    """Set the ``should_stop`` event, then join every worker in ``self.gpu_workers``."""
    self.should_stop.set()
    for gpu in self.gpu_workers:
        logger.debug(f"Wait for GPU[{gpu}] worker...")
        self.gpu_workers[gpu].join()
def mode_run_sh(self):
    """Queue every command, start the GPU workers, and wait for them to finish.

    Always terminates the process via ``sys.exit``:
      - 0 when all workers finished and the queue is empty;
      - 1 when ``self.worker_failed`` is set, a worker exited with a non-zero
        exit code, workers finished with commands left in the queue, or the
        user pressed Ctrl-C.
    """
    # Fill queue with commands to run.
    run_commands = self.run_commands()
    for run_cmd in run_commands:
        logger.debug(f"Put: {run_cmd}")
        self.cmd_queue.put(run_cmd)

    self.start_gpu_workers()

    bar = None
    if self.should_show_progress:
        bar = progressbar.ProgressBar(max_value=len(run_commands))
    last_completed = None

    # Wait for workers to terminate
    try:
        while True:
            if self.should_show_progress:
                completed = len(run_commands) - self.cmd_queue.qsize()
                if last_completed is None or completed > last_completed:
                    bar.update(completed)
                last_completed = completed

            if self.worker_failed.is_set():
                self.stop_workers()
                if not self.skip_final_error_message:
                    logger.error("At least one command failed with non-zero exit status")
                if self.should_show_progress:
                    bar.finish()
                sys.exit(1)

            alive_workers = 0
            failed_workers = 0
            for gpu, worker in self.gpu_workers.items():
                if worker.is_alive():
                    alive_workers += 1
                    continue

                # A worker that hits an unhandled exception exits with status 1
                # (see _gpu_worker); a negative code means "killed by a signal".
                # Both are failures, so test for any non-zero exit code.
                if worker.exitcode not in (0, None):
                    logger.error("GPU[{gpu}] worker failed with exitcode={ret} (unhandled exception)".format(
                        gpu=gpu,
                        ret=worker.exitcode,
                    ))
                    self.worker_failed.set()
                    failed_workers += 1

            if failed_workers > 0:
                self.stop_workers()
                if self.should_show_progress:
                    bar.finish()
                sys.exit(1)

            if alive_workers == 0:
                if self.cmd_queue.qsize() > 0:
                    logger.warning("GPU workers have finished with {len} remaining commands unfinished".format(
                        len=self.cmd_queue.qsize()
                    ))
                    if self.should_show_progress:
                        bar.finish()
                    sys.exit(1)
                logger.debug("GPU workers have finished successfully")
                if self.should_show_progress:
                    bar.finish()
                sys.exit(0)

            time.sleep(2)
    except KeyboardInterrupt:
        logger.info("Saw Ctrl-C; waiting for workers to terminate")
        self.stop_workers()
        logger.warning("{len} remaining commands went unprocessed".format(len=self.cmd_queue.qsize()))
        if self.should_show_progress:
            bar.finish()
        sys.exit(1)
def _sel_all(selector, sel_order, level, md, subtree, skip_missing_fields=False, debug=False): """ Given a subtree key-ed like: subtree = { sel_order[level]: { <sel_order[level] value>: ... } } Recursively iterate according to sel_order[i], using selector to decide which subtrees to visit at each level. :param selector: :param idx: A subtree of the INDEX, where: keys = values of type 'sel_field' Initially when _sel is first called, the idx is INDEX, and the subtree is key-ed by plot-type (e.g. ResourceSubplot). :param sel_field: Initially when _sel is first called, sel_field = 'overlap_type'. :return: """ if debug: logger.debug(f"level = {level}") while True: if level == len(sel_order): yield dict(md), subtree return field = sel_order[level] if field not in subtree and skip_missing_fields: # Subtree is missing field, but there's only one choice of field-value to use. logger.warning("Skipping field={field} since it is missing".format( field=field)) level += 1 elif field in subtree: break else: raise RuntimeError( "Didn't find field={field} in selector; options are {fields}". format( field=field, fields=sorted(subtree.keys()), )) for value, next_subtree in _sel(selector, subtree[field], field, skip_missing_fields=skip_missing_fields, debug=debug): md[field] = value for md, entry in _sel_all(selector, sel_order, level + 1, md, next_subtree, skip_missing_fields=skip_missing_fields, debug=debug): yield md, entry
def main():
    """Command-line entry point: forward a task to the luigi-based task runner.

    Parses --pdb / --task / --workers / --help, passes every other argument
    through untouched, appends the luigi scheduler/logging flags, and calls
    ``tasks.main`` with the assembled argument list.
    """
    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description=textwrap.dedent("""\
        Process trace-files collected from running an ML script with the RL-Scope profiler.

        For task-specific help, provided task-name and --help, e.g.:
        $ rls-run --task OverlapStackedBarTask --help

        NOTE:
        - This script is a thin usage/debugging wrapper around a "luigi" DAG execution script.
          It just forwards arguments to it.
        - Any unparsed args are forward to the luigi script.
        """),
        formatter_class=argparse.RawTextHelpFormatter,
        # --help is handled manually below so it can be forwarded to a task.
        add_help=False,
    )
    parser.add_argument('--pdb', action='store_true',
                        help="Break into pdb when an exception occurs")
    parser.add_argument('--task',
                        choices=[klass.__name__ for klass in tasks.RLSCOPE_TASKS],
                        help="Name of a runnable IMLTask defined in rlscope.parser.tasks")
    parser.add_argument('--workers',
                        type=int,
                        # DISABLE --workers for now to prevent opening too many postgres connections by accident;
                        # we parallelize internally instead
                        # e.g. ResourceOverlap with 32 worker threads, each of which opens a SQL
                        # connection.
                        # default=multiprocessing.cpu_count(),
                        default=1,
                        help="Maximum number of parallel tasks to run (luigi parameter)")
    parser.add_argument('--help', '-h', action='store_true')
    args, luigi_argv = parser.parse_known_args(sys.argv)

    if args.help and not args.task:
        # Print available tasks.
        parser.print_help()
        sys.exit(0)

    if args.task is None and not args.help:
        # If they just run this:
        # $ rls-run --rlscope-directory <dir>
        # Then run all the targets.
        args.task = 'All'

    extra_argv = [
        '--module', 'rlscope.parser.tasks',
        '--local-scheduler',
        # Default log-level from luigi is DEBUG which is too noisy.
        # Make the default level INFO instead.
        '--log-level', 'INFO',
    ]
    luigi_argv.extend(extra_argv)
    if args.task:
        # Task needs to be the first argument after rls-run.
        luigi_argv.insert(1, args.task)

    if args.help:
        luigi_argv.extend(['--help'])

    if args.workers > 1:
        logger.warning("Each overlap plot uses all the cores; forcing --workers=1")
        args.workers = 1

    if args.pdb:
        logger.debug("Registering pdb breakpoint (--pdb)")
        register_pdb_breakpoint()
        # Debugger is useless when multithreaded.
        args.workers = 1

    luigi_argv.extend(['--workers', str(args.workers)])

    with warnings.catch_warnings():
        # I don't really take much advantage of luigi's DFS scheduler and instead run things manually.
        # Oh well.
        warnings.filterwarnings('ignore', category=UserWarning, message=r'.*without outputs has no custom complete', module=r'luigi')
        warnings.filterwarnings('ignore', category=UserWarning, message=r'Parameter.*with value "None" is not of type string', module=r'luigi')
        # luigi_argv[0] is the program name (sys.argv was parsed whole); drop it.
        tasks.main(argv=luigi_argv[1:], should_exit=False)