Code example #1
File: clib_wrap.py  Project: Zhaojp-Frank/rlscope
def register_torch():
    try:
        import torch
        register_wrap_module(wrap_torch, unwrap_torch)
    except ImportError:
        if py_config.DEBUG_WRAP_CLIB:
            logger.debug("torch is NOT installed")
Code example #2
File: wrap_util.py  Project: Zhaojp-Frank/rlscope
def wrap_func(FuncWrapperKlass,
              module,
              name,
              wrapper_args=None,
              func_regex=None,
              ignore_func_regex="^_",
              should_wrap=None):
    # for name in dir(module):
    if wrapper_args is None:
        wrapper_args = _EMPTY_ARGS
    if re.search(ignore_func_regex, name):
        if py_config.DEBUG_WRAP_CLIB:
            logger.info("  Skip func={name}".format(name=name))
        return False
    func = getattr(module, name)
    if type(func) == FuncWrapperKlass or not callable(func):
        return False
    if func_regex is not None and not re.search(func_regex, name):
        return False
    if should_wrap is not None and not should_wrap(name, func):
        return False
    if inspect.isclass(func) or inspect.ismodule(func):
        if py_config.DEBUG:
            logger.debug(
                "Cannot wrap {module}.{name} since it's not a function: {value}"
                .format(
                    module=module.__name__,
                    name=name,
                    value=func,
                ))
        return False

    func_wrapper = FuncWrapperKlass(func, *wrapper_args)
    setattr(module, name, func_wrapper)
    return True
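
A minimal usage sketch of wrap_func. TimingWrapper is a hypothetical FuncWrapperKlass (not part of rlscope), and the snippet assumes wrap_util.py's own imports (re, inspect, logger, py_config, _EMPTY_ARGS) are in scope:

import math

class TimingWrapper:
    # Hypothetical wrapper class: counts calls and delegates to the wrapped function.
    def __init__(self, func):
        self.func = func
        self.calls = 0

    def __call__(self, *args, **kwargs):
        self.calls += 1
        return self.func(*args, **kwargs)

# Replace math.sqrt with a TimingWrapper instance.
wrapped = wrap_func(TimingWrapper, math, 'sqrt', func_regex=r'^sqrt$')
assert wrapped
math.sqrt(4.0)          # goes through TimingWrapper.__call__
print(math.sqrt.calls)  # 1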
Code example #3
def write(self, f, txt, lstrip=True):
    txt_formatted = textwrap.dedent(txt)
    if lstrip:
        txt_formatted = txt_formatted.lstrip()
    if self.debug:
        logger.debug("Write to {f}:\n{txt}".format(
            txt=textwrap.indent(txt_formatted.rstrip(), prefix='  '),
            f=f.name,
        ))
    f.write(txt_formatted)
Code example #4
File: clib_wrap.py  Project: Zhaojp-Frank/rlscope
def register_detected_libs():
    try:
        import atari_py
        register_wrap_module(wrap_atari, unwrap_atari)
    except ImportError:
        pass

    try:
        import tensorflow
        if is_tensorflow_v2():
            if py_config.DEBUG_WRAP_CLIB:
                logger.debug("Detected TensorFlow v2")
            register_wrap_module(wrap_tensorflow_v2, unwrap_tensorflow_v2)
        else:
            if py_config.DEBUG_WRAP_CLIB:
                logger.debug("Detected TensorFlow v1")
            register_wrap_module(wrap_tensorflow_v1, unwrap_tensorflow_v1)
    except ImportError:
        if py_config.DEBUG_WRAP_CLIB:
            logger.debug("TensorFlow is NOT installed")

    try:
        import pybullet
        register_wrap_module(wrap_pybullet, unwrap_pybullet)
    except ImportError:
        if py_config.DEBUG_WRAP_CLIB:
            logger.debug("PyBullet is NOT installed")

    # NOTE: we delay this to avoid messing up torch.jit.script until we support it better.
    register_torch()
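
The same detect-and-register pattern extends to any other C-backed library. A sketch for a hypothetical library; mylib, wrap_mylib, and unwrap_mylib are illustrative names only:

def register_mylib():
    try:
        import mylib  # hypothetical library
        register_wrap_module(wrap_mylib, unwrap_mylib)
    except ImportError:
        if py_config.DEBUG_WRAP_CLIB:
            logger.debug("mylib is NOT installed")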
Code example #5
def _gpu_worker(self, gpu, *args, **kwargs):
    try:
        self.gpu_worker(gpu, *args, **kwargs)
    except KeyboardInterrupt:
        logger.debug(f"GPU[{gpu}] worker saw Ctrl-C; exiting early")
        return
    except Exception as e:
        logger.error(textwrap.dedent("""\
        BUG: GPU[{gpu}] worker failed with unhandled exception:
        {error}
        """).format(
            gpu=gpu,
            error=textwrap.indent(traceback.format_exc(), prefix='  '),
        ).rstrip())
        sys.exit(1)
Code example #6
    def gpu_worker(self, gpu):
        # :\n{msg}
        logger.debug("Start GPU[{gpu}] worker; queue contains {len} items".format(
            len=self.cmd_queue.qsize(),
            gpu=gpu,
            # msg=textwrap.indent(pprint.pformat(list(self.cmd_queue)), prefix='  '),
        ))
        while True:
            run_cmd = None
            try:
                # run_cmd = self.cmd_queue.get(block=False)
                logger.debug(f"Get: GPU[{gpu}] worker...")
                # This hangs forever when timeout isn't provided...
                # run_cmd = self.cmd_queue.get()
                run_cmd = self.cmd_queue.get(timeout=1)
            except queue.Empty:
                logger.debug(f"No more items in queue; exiting GPU[{gpu}] worker")
                break
            # except socket.timeout:
            #     logger.debug(f"Saw timeout; no more items in queue (len={self.cmd_queue.qsize()})? Exiting GPU[{gpu}] worker")
            #     assert self.cmd_queue.qsize() == 0
            #     break

            logfile = run_cmd.logfile
            logger.debug(
                textwrap.dedent("""\
                    GPU[{gpu}] worker running command (queue size = {len}):
                    > CMD:
                      logfile={logfile}
                      $ {cmd}
                    """).format(
                    len=self.cmd_queue.qsize(),
                    cmd=' '.join(run_cmd.cmd),
                    gpu=gpu,
                    logfile=logfile,
                ).rstrip())
            if self.retry is None:
                retries = 1
            else:
                retries = self.retry
            for attempt in range(1, retries+1):
                proc = self.run_cmd(gpu, run_cmd, tee_output=self.tee or self.should_show_output, tee_prefix=f"GPU[{gpu}] :: ", attempt=attempt)
                if self.dry_run or proc is None or proc.returncode == 0:
                    break

            if self.should_stop.is_set() or self.worker_failed.is_set():
                logger.debug(f"Got exit signal; exiting GPU[{gpu}] worker")
                break
Code example #7
def autodoc_skip_member_handler(app, what, name, obj, skip, options):
    # Basic approach; you might want a regex instead
    # import pdb; pdb.set_trace()
    if re.search(r'^test_|^Test|^rlscope_plot_index\.py', name):
        if DEBUG_SPHINX:
            logger.debug("SKIP: {name} {msg}".format(
                name=name,
                msg=pprint_msg(locals())))
        return True
    if DEBUG_SPHINX:
        logger.debug("DOCS: {name} {msg}".format(
            name=name,
            msg=pprint_msg(locals())))
    # Use "default" handling behaviour of whether to skip docs for this symbol.
    # NOTE: we don't return False since it results in weird symbols like
    # __dict__ getting documented.
    return None
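
For context, a handler like this is typically connected in Sphinx's conf.py via the standard autodoc-skip-member event; the setup() function below sketches that hookup:

def setup(app):
    # Register the skip handler with sphinx.ext.autodoc.
    app.connect('autodoc-skip-member', autodoc_skip_member_handler)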
Code example #8
File: rlscope_api.py  Project: Zhaojp-Frank/rlscope
def record_event(category, start_us, duration_us, name):
    if py_config.DEBUG and py_config.DEBUG_RLSCOPE_LIB_CALLS:
        logger.info(
            _log_api_call_msg('record_event', category, start_us, duration_us,
                              name))
    if py_config.DEBUG and duration_us < 0:
        logger.debug(
            "BUG: recorded event with negative duration: Event(category={category}, name={name}, start_us={start_us}, dur_us={duration_us})"
            .format(
                category=category,
                start_us=start_us,
                duration_us=duration_us,
                name=name,
            ))
    ret = _so.rlscope_record_event(
        _as_c_string(category),
        c_int64(int(start_us)),
        c_int64(int(duration_us)),
        _as_c_string(name),
    )
    if ret != TF_OK:
        raise RLScopeLibraryError(ret)
    return ret
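
A usage sketch of record_event with hypothetical values; it assumes the rlscope shared library has been loaded into _so and a profiling session is active:

import time

start_us = int(time.time() * 1e6)
# ... region of interest being traced ...
end_us = int(time.time() * 1e6)
record_event(
    category='Python',            # hypothetical category label
    start_us=start_us,
    duration_us=end_us - start_us,
    name='training_step',         # hypothetical event name
)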
Code example #9
def main():

    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description=textwrap.dedent(__doc__.lstrip().rstrip()),
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--run",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Run the command as-is.
                        """))
    parser.add_argument("--append",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Append the command to --sh
                        """))
    parser.add_argument("--sh",
                        help=textwrap.dedent("""\
                        Shell file to append commands to (see --append).
                        """))
    parser.add_argument('--run-sh',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Run all the commands in --sh on the available --gpus
                        """))
    parser.add_argument('--rlscope-directory',
                        help=textwrap.dedent("""\
                        The output directory of the command being run.
                        This is where logfile.out will be output.
                        """))
    parser.add_argument("--verbosity",
                        choices=['progress', 'commands', 'output'],
                        default='progress',
                        help=textwrap.dedent("""\
                            Output information about running commands.
                            --verbosity progress (Default)
                                Only show high-level progress bar information.
                              
                            --verbosity commands
                                Show the command-line of commands that are being run.
                                
                            --verbosity output
                                Show the output of each analysis (not configuration) command on sys.stdout.
                                NOTE: This may cause interleaving of lines.
                            """))
    parser.add_argument('--line-numbers', action='store_true', help=textwrap.dedent("""\
    Show line numbers and timestamps in RL-Scope logging messages.
    """))
    parser.add_argument('--debug',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Debug
                        """))
    parser.add_argument('--skip-final-error-message',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Skip error message printed at the end if at least one command fails.
                        """))
    parser.add_argument("--retry",
                        type=int,
                        help=textwrap.dedent("""\
                            If a command fails, retry it up to --retry times.
                            Default: don't retry.
                            """))
    parser.add_argument("--tee",
                        action='store_true',
                        help=textwrap.dedent("""\
                        (debug)
                        tee output of parallel processes to stdout (prefix output with worker name)
                        """))
    parser.add_argument("--pdb",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Debug
                        """))
    parser.add_argument('--dry-run',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Dry run
                        """))
    parser.add_argument('--skip-errors',
                        action='store_true',
                        help=textwrap.dedent("""\
                        If a command fails, ignore the failure and continue running other commands.
                        """))
    parser.add_argument("--gpus",
                        default='all',
                        help=textwrap.dedent("""\
                        # Run on the first GPU only
                        --gpus 0
                        # Run on the first 2 GPUs
                        --gpus 0,1
                        # Run on all available GPUs
                        --gpus all
                        # Don't allow running with any GPUs (CUDA_VISIBLE_DEVICES="")
                        --gpus none
                        """))
    all_args, _ = parser.parse_known_args(sys.argv)
    ignore_opts = set()
    if all_args.sh is not None:
        ignore_opts.add(all_args.sh)
    run_expr_argv, cmd = gather_argv(
        sys.argv[1:],
        ignore_opts=ignore_opts)
    args = parser.parse_args(run_expr_argv)

    if args.debug:
        logger.debug({
            'run_expr_argv': run_expr_argv,
            'cmd': cmd,
        })

    rlscope_logging.setup_logger(
        debug=args.debug,
        line_numbers=args.debug or args.line_numbers or py_config.is_development_mode(),
    )

    if args.sh is None and ( args.run_sh or args.append ):
        error("--sh is required when either --run-sh or --append are given", parser=parser)

    if args.run_sh and ( args.append or args.run ):
        error("When --run-sh is given, you cannot provide either --append or --run", parser=parser)

    available_gpus = get_available_gpus()
    if args.gpus == 'all':
        gpus = sorted([gpu['device_number'] for gpu in available_gpus])
    elif args.gpus.lower() == 'none':
        gpus = [None]
    else:
        try:
            gpus = sorted([int(gpu) for gpu in re.split(r',', args.gpus)])
        except ValueError:
            error("Failed to parser --gpus={gpus}".format(gpus=args.gpus), parser=parser)

    assert len(gpus) >= 1

    if args.run or args.append:
        if len(cmd) == 0:
            error("Expected cmd to run in arguments, but none was provided",
                  parser=parser)

        if shutil.which(cmd[0]) is None:
            error("Couldn't find {exec} on PATH".format(
                exec=cmd[0]), parser=parser)

    if all_args.rlscope_directory is None:
        # No --rlscope-directory argument; just use current directory?
        args.rlscope_directory = os.getcwd()
    else:
        args.rlscope_directory = all_args.rlscope_directory
    # # error("\n  {cmd}".format(cmd=' '.join(cmd)))
    # error(textwrap.dedent("""\
    # --rlscope-directory must be provided so we know where to output logfile.out for cmd:
    #   > CMD:
    #     $ {cmd}
    #   """).format(
    #   cmd=' '.join(cmd),
    # ).rstrip())
    # # "Copy" --rlscope-directory argument from cmd.
    # args.rlscope_directory = all_args.rlscope_directory

    args_dict = dict(vars(args))
    args_dict.pop('gpus')
    args_dict.pop('pdb')
    obj = RunExpr(
        cmd=cmd,
        gpus=gpus,
        **args_dict,
    )

    def _run():
        obj.run_program()
    run_with_pdb(args, _run)
Code example #10
def stop_workers(self):
    self.should_stop.set()
    for gpu, worker in self.gpu_workers.items():
        logger.debug(f"Wait for GPU[{gpu}] worker...")
        worker.join()
Code example #11
    def mode_run_sh(self):
        # Fill queue with commands to run.
        run_commands = self.run_commands()
        for run_cmd in run_commands:
            logger.debug(f"Put: {run_cmd}")
            self.cmd_queue.put(run_cmd)

        self.start_gpu_workers()

        bar = None
        if self.should_show_progress:
            bar = progressbar.ProgressBar(max_value=len(run_commands))
        last_completed = None

        # Wait for workers to terminate
        try:
            while True:
                if self.should_show_progress:
                    completed = len(run_commands) - self.cmd_queue.qsize()
                    if last_completed is None or completed > last_completed:
                        bar.update(completed)
                    last_completed = completed

                if self.worker_failed.is_set():
                    self.stop_workers()
                    # ; use --skip-errors to ignore failed commands.
                    if not self.skip_final_error_message:
                        logger.error("At least one command failed with non-zero exit status")
                    if self.should_show_progress:
                        bar.finish()
                    sys.exit(1)

                alive_workers = 0
                failed_workers = 0
                for gpu, worker in self.gpu_workers.items():
                    if worker.is_alive():
                        alive_workers += 1
                        continue

                    if worker.exitcode < 0:
                        logger.error("GPU[{gpu}] worker failed with exitcode={ret} (unhandled exception)".format(
                            gpu=gpu,
                            ret=worker.exitcode,
                        ))
                        self.worker_failed.set()
                        failed_workers += 1

                if failed_workers > 0:
                    self.stop_workers()
                    if self.should_show_progress:
                        bar.finish()
                    sys.exit(1)

                if alive_workers == 0:
                    if self.cmd_queue.qsize() > 0:
                        logger.warning("GPU workers have finished with {len} remaining commands unfinished".format(
                            len=self.cmd_queue.qsize()
                        ))
                        sys.exit(1)
                    logger.debug("GPU workers have finished successfully".format(
                        len=self.cmd_queue.qsize()
                    ))
                    if self.should_show_progress:
                        bar.finish()
                    sys.exit(0)

                time.sleep(2)
        except KeyboardInterrupt:
            logger.info("Saw Ctrl-C; waiting for workers to terminate")
            self.stop_workers()
            logger.warning("{len} remaining commands went unprocessed".format(len=self.cmd_queue.qsize()))
            if self.should_show_progress:
                bar.finish()
            sys.exit(1)
Code example #12
File: plot_index.py  Project: Zhaojp-Frank/rlscope
def _sel_all(selector,
             sel_order,
             level,
             md,
             subtree,
             skip_missing_fields=False,
             debug=False):
    """
    Given a subtree key-ed like:
    subtree = {
      sel_order[level]: {
          <sel_order[level] value>: ...
      }
    }
    Recursively iterate according to sel_order[i], using selector
    to decide which subtrees to visit at each level.

    :param selector:
    :param subtree:
        A subtree of the INDEX, where:
        keys = values of type sel_order[level]

        Initially when _sel_all is first called, the subtree is the INDEX itself, key-ed by plot-type (e.g. ResourceSubplot).

    :param sel_order:
        The ordered list of fields to recurse over; initially sel_order[0] = 'overlap_type'.

    :return:
    """
    if debug:
        logger.debug(f"level = {level}")
    while True:
        if level == len(sel_order):
            yield dict(md), subtree
            return
        field = sel_order[level]

        if field not in subtree and skip_missing_fields:
            # Subtree is missing field, but there's only one choice of field-value to use.
            logger.warning("Skipping field={field} since it is missing".format(
                field=field))
            level += 1
        elif field in subtree:
            break
        else:
            raise RuntimeError(
                "Didn't find field={field} in selector; options are {fields}".
                format(
                    field=field,
                    fields=sorted(subtree.keys()),
                ))

    for value, next_subtree in _sel(selector,
                                    subtree[field],
                                    field,
                                    skip_missing_fields=skip_missing_fields,
                                    debug=debug):
        md[field] = value
        for md, entry in _sel_all(selector,
                                  sel_order,
                                  level + 1,
                                  md,
                                  next_subtree,
                                  skip_missing_fields=skip_missing_fields,
                                  debug=debug):
            yield md, entry
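
To make the docstring concrete, here is a sketch of the nesting _sel_all expects. The field names and leaf values are hypothetical stand-ins for what plot_index.py actually stores in INDEX:

sel_order = ['overlap_type', 'process']
subtree = {
    'overlap_type': {
        'ResourceOverlap': {
            'process': {
                'worker_0': {'venn_js_path': 'ResourceOverlap.worker_0.venn_js.json'},
            },
        },
    },
}
# _sel_all(selector, sel_order, level=0, md=dict(), subtree=subtree) recurses over
# 'overlap_type' and then 'process', yielding (md, leaf) pairs such as:
#   ({'overlap_type': 'ResourceOverlap', 'process': 'worker_0'},
#    {'venn_js_path': 'ResourceOverlap.worker_0.venn_js.json'})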
Code example #13
def main():

    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description=textwrap.dedent("""\
        Process trace-files collected from running an ML script with the RL-Scope profiler.
        
        For task-specific help, provide the task name and --help, e.g.:
        $ rls-run --task OverlapStackedBarTask --help
        
        NOTE: 
        - This script is a thin usage/debugging wrapper around a "luigi" DAG execution script. 
          It just forwards arguments to it.
        - Any unparsed args are forwarded to the luigi script.
        """),
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )
    parser.add_argument('--pdb', action='store_true',
                        help="Break into pdb when an exception occurs")
    parser.add_argument('--task',
                        choices=[klass.__name__ for klass in tasks.RLSCOPE_TASKS],
                        help="Name of a runnable IMLTask defined in rlscope.parser.tasks")
    parser.add_argument('--workers',
                        type=int,
                        # DISABLE --workers for now to prevent opening too many postgres connections by accident;
                        # we parallelize internally instead
                        # e.g. ResourceOverlap with 32 worker threads, each of which opens a SQL
                        # connection.
                        # default=multiprocessing.cpu_count(),
                        default=1,
                        help="Maximum number of parallel tasks to run (luigi parameter)")
    parser.add_argument('--help', '-h',
                        action='store_true')
    args, luigi_argv = parser.parse_known_args(sys.argv)

    if args.help and not args.task:
        # Print available tasks.
        parser.print_help()
        sys.exit(0)

    if args.task is None and not args.help:
        # If they just run this:
        # $ rls-run --rlscope-directory <dir>
        # Then run all the targets.
        args.task = 'All'

    extra_argv = [
        '--module', 'rlscope.parser.tasks',
        '--local-scheduler',
        # Default log-level from luigi is DEBUG which is too noisy.
        # Make the default level INFO instead.
        '--log-level', 'INFO',
    ]
    luigi_argv.extend(extra_argv)
    if args.task:
        # Task needs to be the first argument after rls-run.
        luigi_argv.insert(1, args.task)

    if args.help:
        luigi_argv.extend(['--help'])

    if args.workers > 1:
        logger.warning("Each overlap plot uses all the cores; forcing --workers=1")
        args.workers = 1

    if args.pdb:
        logger.debug("Registering pdb breakpoint (--pdb)")
        register_pdb_breakpoint()
        # Debugger is useless when multithreaded.
        args.workers = 1

    luigi_argv.extend(['--workers', str(args.workers)])

    # logger.debug("Luigi arguments:\n{msg}".format(msg=textwrap.indent(pprint.pformat({
    #     'luigi_argv':luigi_argv,
    #     'sys.argv':sys.argv,
    # }), prefix='  ')))

    with warnings.catch_warnings():
        # I don't really take much advantage of luigi's DFS scheduler and instead run things manually.
        # Oh well.
        warnings.filterwarnings('ignore', category=UserWarning, message=r'.*without outputs has no custom complete', module=r'luigi')
        warnings.filterwarnings('ignore', category=UserWarning, message=r'Parameter.*with value "None" is not of type string', module=r'luigi')
        tasks.main(argv=luigi_argv[1:], should_exit=False)
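
Typical invocations, following the usage described in the parser description and the comments above (the trace directory path is illustrative):

$ rls-run --task OverlapStackedBarTask --help
$ rls-run --rlscope-directory ./rlscope_traces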