Example #1
0
def check_nvidia_smi(exit_if_fail=False, debug=False):
    """
    Make sure nvidia-smi runs fast enough to perform GPU utilization sampling.

    Times a single call to run_nvidia_smi(); if it takes longer than
    MAX_NVIDIA_SMI_TIME_SEC, a message describing how to speed it up is logged.

    :param exit_if_fail:
        If True, log the message as an error and exit the process with status 1.
        If False, only log the message as a warning.
    :param debug:
        Forwarded to run_nvidia_smi().
    :return:
        None
    """
    # Measure with a monotonic clock; time.time() follows the wall clock, which can
    # jump (e.g. on clock adjustment) and distort the measured duration.
    start_t = time.perf_counter()
    # $ nvidia-smi
    run_nvidia_smi(debug=debug)
    nvidia_smi_sec = time.perf_counter() - start_t
    if nvidia_smi_sec > MAX_NVIDIA_SMI_TIME_SEC:
        # $ sudo service nvidia-persistenced start
        errmsg = textwrap.dedent("""
        RL-Scope WARNING: nvidia-smi takes a long time to run on your system.
        In particular, it took {sec} sec to run nvidia-smi (we would prefer < {limit_sec}).
        This will interfere with sampling GPU utilization.
        You can fix this by running the following command:

        # Start systemd nvidia-persistenced service (if it's not already running).
        $ sudo nvidia-persistenced --persistence-mode

        For more details see:
        https://devtalk.nvidia.com/default/topic/1011192/nvidia-smi-is-slow-on-ubuntu-16-04-/
        """).format(
            sec=nvidia_smi_sec,
            limit_sec=MAX_NVIDIA_SMI_TIME_SEC,
        )
        if exit_if_fail:
            logger.error(errmsg)
            sys.exit(1)
        else:
            logger.warning(errmsg)
Example #2
0
def register_wrap_module(wrap_module, unwrap_module):
    """
    Record a WrappedModule built from (wrap_module, unwrap_module) in WRAPPED_MODULES.

    If _LIBS_WRAPPED is already truthy, the registration is "late": a warning is
    logged and the new module is wrapped immediately.
    """
    late_module = WrappedModule(wrap_module, unwrap_module)
    WRAPPED_MODULES.append(late_module)
    if not _LIBS_WRAPPED:
        # Nothing is wrapped yet; registering is all that is needed for now.
        return
    logger.warning(
        f"Registering wrapped module late for {wrap_module.__name__}; you will miss recording calls that happpened before this point"
    )
    late_module.wrap()
Example #3
0
def _add_rlscope_args(args):
    """
    Disable RL-Scope in ``args`` when the RL-Scope API is not in use.

    Does nothing if rlscope_api.is_used() is true or if 'rlscope_disable' is
    already set in ``args``. Otherwise logs a hint about the ``rls-prof`` prefix
    and sets 'rlscope_disable' to True.
    """
    if rlscope_api.is_used() or get_arg(args, 'rlscope_disable'):
        return
    hint = textwrap.dedent("""\
        Skipping RL-Scope profiling; to run with RL-Scope prefix your command with:
          $ rls-prof ...
            --------
        """).rstrip()
    logger.warning(hint)
    set_arg(args, 'rlscope_disable', True)
Example #4
0
def pdf2svg(path, output=None, can_skip=False):
    """
    Convert the PDF at ``path`` to SVG by invoking the ``pdf2svg`` shell command.

    :param path:
        Input PDF file.
    :param output:
        Output SVG path; when None it is derived from ``path`` by replacing a
        trailing ".pdf" with ".svg".
    :param can_skip:
        If True, a missing ``pdf2svg`` executable only logs a warning and the
        conversion is skipped; if False, RuntimeError is raised instead.
    """
    if output is None:
        output = re.sub(r'\.pdf$', '.svg', path)
        # An unchanged name means "path" does not end with .pdf.
        assert path != output

    if shutil.which('pdf2svg'):
        # pdf2svg <input> <output>
        convert_cmd = ['pdf2svg', path, output]
        subprocess.check_call(convert_cmd)
        return

    if not can_skip:
        raise RuntimeError(f"pdf2svg shell command not found for: \"pdf2svg {path} {output}\".  Install with \"sudo apt install pdf2svg\"")
    logger.warning(f"pdf2svg shell command not found; SKIP: pdf2svg {path} {output}")
Example #5
0
    def dump_plot_index_py(self):
        """
        Generate the plot-index python file at self.plot_index_path.

        Steps:
        - Unless self.dry_run, copy rlscope/scripts/rlscope_plot_index.py (under
          py_config.ROOT) into self.out_dir.
        - If self.plot_index_path already exists and self.replace is false, log a
          warning and return without generating anything.
        - Render the file contents (DIRECTORY, INDEX, and the command line / working
          directory used to generate it); log them when self.debug is set.
        - Unless self.dry_run, write the contents to self.plot_index_path.

        The output file is only opened when it is actually going to be written, so a
        dry run never truncates an existing index file.
        """
        cmd = sys.argv

        if not self.dry_run:
            src = _j(py_config.ROOT, 'rlscope/scripts/rlscope_plot_index.py')
            dst = _j(self.out_dir, 'rlscope_plot_index.py')
            logger.info("cp {src} -> {dst}".format(src=src, dst=dst))
            os.makedirs(_d(dst), exist_ok=True)
            shutil.copyfile(src, dst)

        os.makedirs(_d(self.plot_index_path), exist_ok=True)
        if _e(self.plot_index_path) and not self.replace:
            logger.warning(
                "{path} exists; skipping".format(path=self.plot_index_path))
            return

        # Render everything before touching the output file: opening with 'w'
        # truncates immediately, which must not happen in dry-run mode or when
        # rendering fails.
        contents = textwrap.dedent("""\
            #!/usr/bin/env python3

            ### GENERATED FILE; do NOT modify!
            ### generated using: 
            ### CMD:
            ###   {cmd}
            ### PWD:
            ###   {pwd}

            DIRECTORY = "{dir}"
            INDEX = \\
            {index}
            """).format(
            dir=os.path.realpath(self.directory),
            index=textwrap.indent(pprint.pformat(self.index),
                                  prefix="    "),
            cmd=" ".join(cmd),
            pwd=os.getcwd(),
        )
        if self.debug:
            logger.info("> Generated file: {path}".format(
                path=self.plot_index_path))
            logger.info(contents)

        if not self.dry_run:
            with open(self.plot_index_path, 'w') as f:
                f.write(contents)
Example #6
0
def pdf2png(path, output=None, can_skip=True, silent=True):
    """
    Render the first page of the PDF at ``path`` to a PNG using ``pdftoppm``.

    :param path:
        Input file; when is_pdf(path) is false and ``can_skip`` is set, nothing is done.
    :param output:
        Output PNG path; when None it is derived from ``path`` by replacing a
        trailing ".pdf" with ".png".
    :param can_skip:
        If True, non-PDF inputs and a missing ``pdftoppm`` executable are skipped;
        if False, a missing executable raises RLScopeConfigurationError.
    :param silent:
        If False, log a warning when the conversion is skipped because
        ``pdftoppm`` is not installed.
    """
    if not is_pdf(path) and can_skip:
        return

    if output is None:
        output = re.sub(r'\.pdf$', '.png', path)
        # An unchanged name means "path" does not end with .pdf.
        assert path != output

    if not shutil.which('pdftoppm'):
        if not can_skip:
            raise RLScopeConfigurationError(f"pdftoppm shell command not found for: \"pdftoppm {path} {output}\".  Install with \"sudo apt install poppler-utils\"")
        if not silent:
            logger.warning(f"pdftoppm shell command not found; SKIP: \"pdftoppm {path} {output}\"")
        return

    # pdftoppm <input> -png -f 1 -singlefile, with the image captured from stdout.
    render_cmd = ['pdftoppm', path, '-png', '-f', '1', '-singlefile']
    with open(output, 'wb') as png_file:
        subprocess.check_call(render_cmd, stdout=png_file)
Example #7
0
    def mode_run_sh(self):
        """
        Queue every command from self.run_commands(), start the GPU workers, and
        poll every 2 seconds until they are done. This method always ends the
        process through sys.exit().

        Exit status:
          0 -- no worker is alive any more and the command queue is empty.
          1 -- self.worker_failed was set, a worker ended with a negative exitcode,
               the workers ended with commands still queued, or Ctrl-C was pressed.

        When self.should_show_progress is true a progress bar tracks the number of
        dequeued commands; it is finished on every exit path.
        """
        # Fill queue with commands to run.
        run_commands = self.run_commands()
        for run_cmd in run_commands:
            logger.debug(f"Put: {run_cmd}")
            self.cmd_queue.put(run_cmd)

        self.start_gpu_workers()

        bar = None
        if self.should_show_progress:
            bar = progressbar.ProgressBar(max_value=len(run_commands))
        last_completed = None

        def finish_bar():
            # Finish the progress bar, if one was created.
            if bar is not None:
                bar.finish()

        # Wait for workers to terminate
        try:
            while True:
                if bar is not None:
                    # Commands no longer in the queue count as completed for display purposes.
                    completed = len(run_commands) - self.cmd_queue.qsize()
                    if last_completed is None or completed > last_completed:
                        bar.update(completed)
                    last_completed = completed

                if self.worker_failed.is_set():
                    self.stop_workers()
                    # ; use --skip-errors to ignore failed commands.
                    if not self.skip_final_error_message:
                        logger.error("At least one command failed with non-zero exit status")
                    finish_bar()
                    sys.exit(1)

                alive_workers = 0
                failed_workers = 0
                for gpu, worker in self.gpu_workers.items():
                    if worker.is_alive():
                        alive_workers += 1
                        continue

                    # NOTE(review): only negative exit codes are treated as failures here;
                    # confirm whether positive non-zero exit codes should count as well.
                    if worker.exitcode < 0:
                        logger.error("GPU[{gpu}] worker failed with exitcode={ret} (unhandled exception)".format(
                            gpu=gpu,
                            ret=worker.exitcode,
                        ))
                        self.worker_failed.set()
                        failed_workers += 1

                if failed_workers > 0:
                    self.stop_workers()
                    finish_bar()
                    sys.exit(1)

                if alive_workers == 0:
                    if self.cmd_queue.qsize() > 0:
                        logger.warning("GPU workers have finished with {len} remaining commands unfinished".format(
                            len=self.cmd_queue.qsize()
                        ))
                        # Finish the bar here too, like every other exit path.
                        finish_bar()
                        sys.exit(1)
                    logger.debug("GPU workers have finished successfully")
                    finish_bar()
                    sys.exit(0)

                time.sleep(2)
        except KeyboardInterrupt:
            logger.info("Saw Ctrl-C; waiting for workers to terminate")
            self.stop_workers()
            logger.warning("{len} remaining commands went unprocessed".format(len=self.cmd_queue.qsize()))
            finish_bar()
            sys.exit(1)
Example #8
0
NOTE: We don't bother to wrap export_saved_model, simply because it doesn't expose any convenient hooks...
but we COULD wrap it (minigo doesn't use it).
"""

from rlscope.profiler.rlscope_logging import logger
from rlscope import py_config

# Set to True when TensorFlow cannot be imported while running unit tests;
# the TensorFlow-dependent remainder of this module is guarded by this flag.
SKIP_MODULE = False
if py_config.is_running_unit_tests():
    try:
        # Probe imports: check that TensorFlow and the symbol used below are importable.
        import tensorflow
        from tensorflow.python.training.training import CheckpointSaverListener
    except ImportError:
        # TensorFlow is unavailable: mark the module as skipped and warn
        # instead of letting the ImportError propagate.
        SKIP_MODULE = True
        logger.warning(
            "SKIP import of {path} during pytest; TensorFlow not installed".
            format(path=__file__, ))

# Python cannot "return" from a module, hence the giant if statement.
if not SKIP_MODULE:
    import tensorflow as tf

    import rlscope

    from tensorflow.python.training.training import CheckpointSaverListener
    """
    Wrap tf.estimator.Estimator.
    """
    old_Estimator = None

    def setup_wrap_Estimator():
Example #9
0
def _sel_all(selector,
             sel_order,
             level,
             md,
             subtree,
             skip_missing_fields=False,
             debug=False):
    """
    Given a subtree key-ed like:
    subtree = {
      sel_order[level]: {
          <sel_order[level] value>: ...
      }
    }
    Recursively iterate according to sel_order[i], using selector
    to decide which subtrees to visit at each level.

    :param selector:
    :param idx:
        A subtree of the INDEX, where:
        keys = values of type 'sel_field'

        Initially when _sel is first called, the idx is INDEX, and the subtree is key-ed by plot-type (e.g. ResourceSubplot).

    :param sel_field:
        Initially when _sel is first called, sel_field = 'overlap_type'.

    :return:
    """
    if debug:
        logger.debug(f"level = {level}")
    while True:
        if level == len(sel_order):
            yield dict(md), subtree
            return
        field = sel_order[level]

        if field not in subtree and skip_missing_fields:
            # Subtree is missing field, but there's only one choice of field-value to use.
            logger.warning("Skipping field={field} since it is missing".format(
                field=field))
            level += 1
        elif field in subtree:
            break
        else:
            raise RuntimeError(
                "Didn't find field={field} in selector; options are {fields}".
                format(
                    field=field,
                    fields=sorted(subtree.keys()),
                ))

    for value, next_subtree in _sel(selector,
                                    subtree[field],
                                    field,
                                    skip_missing_fields=skip_missing_fields,
                                    debug=debug):
        md[field] = value
        for md, entry in _sel_all(selector,
                                  sel_order,
                                  level + 1,
                                  md,
                                  next_subtree,
                                  skip_missing_fields=skip_missing_fields,
                                  debug=debug):
            yield md, entry
Example #10
0
def main():
    """
    Entry point: validate the host configuration, parse the wrapper's own
    arguments, and forward everything else to the luigi task runner (tasks.main).

    Exits with status 1 if check_host.check_config() raises
    RLScopeConfigurationError, and with status 0 after printing help when --help
    is given without --task.
    """
    try:
        check_host.check_config()
    except RLScopeConfigurationError as config_error:
        logger.error(config_error)
        sys.exit(1)

    description = textwrap.dedent("""\
        Process trace-files collected from running an ML script with the RL-Scope profiler.
        
        For task-specific help, provided task-name and --help, e.g.:
        $ rls-run --task OverlapStackedBarTask --help
        
        NOTE: 
        - This script is a thin usage/debugging wrapper around a "luigi" DAG execution script. 
          It just forwards arguments to it.
        - Any unparsed args are forward to the luigi script.
        """)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter,
        # --help is handled manually below so it can be forwarded to luigi.
        add_help=False,
    )
    parser.add_argument(
        '--pdb',
        action='store_true',
        help="Break into pdb when an exception occurs")
    task_names = [klass.__name__ for klass in tasks.RLSCOPE_TASKS]
    parser.add_argument(
        '--task',
        choices=task_names,
        help="Name of a runnable IMLTask defined in rlscope.parser.tasks")
    # --workers is effectively disabled (forced to 1 below) to avoid opening too many
    # postgres connections by accident; parallelism happens internally instead,
    # e.g. ResourceOverlap with 32 worker threads, each of which opens a SQL connection.
    parser.add_argument(
        '--workers',
        type=int,
        default=1,
        help="Maximum number of parallel tasks to run (luigi parameter)")
    parser.add_argument('--help', '-h', action='store_true')

    # sys.argv (including the program name) is parsed on purpose; the program name
    # is dropped again when luigi_argv[1:] is handed to tasks.main().
    args, luigi_argv = parser.parse_known_args(sys.argv)

    if args.help and not args.task:
        # Print available tasks.
        parser.print_help()
        sys.exit(0)

    if args.task is None and not args.help:
        # If they just run this:
        # $ rls-run --rlscope-directory <dir>
        # Then run all the targets.
        args.task = 'All'

    luigi_argv += ['--module', 'rlscope.parser.tasks']
    luigi_argv += ['--local-scheduler']
    # Default log-level from luigi is DEBUG which is too noisy.
    # Make the default level INFO instead.
    luigi_argv += ['--log-level', 'INFO']

    if args.task:
        # Task needs to be the first argument after rls-run.
        luigi_argv.insert(1, args.task)

    if args.help:
        luigi_argv.append('--help')

    num_workers = args.workers
    if num_workers > 1:
        logger.warning("Each overlap plot uses all the cores; forcing --workers=1")
        num_workers = 1

    if args.pdb:
        logger.debug("Registering pdb breakpoint (--pdb)")
        register_pdb_breakpoint()
        # Debugger is useless when multithreaded.
        num_workers = 1

    luigi_argv += ['--workers', str(num_workers)]

    with warnings.catch_warnings():
        # The luigi DFS scheduler is barely used (things are run manually), so
        # silence the UserWarnings luigi emits about that.
        warnings.filterwarnings('ignore', category=UserWarning, message=r'.*without outputs has no custom complete', module=r'luigi')
        warnings.filterwarnings('ignore', category=UserWarning, message=r'Parameter.*with value "None" is not of type string', module=r'luigi')
        tasks.main(argv=luigi_argv[1:], should_exit=False)