def run(self, test_name):
    """
    Write the application and controller configuration files, then
    launch geopmbench through the GEOPM launcher.

    Standard output and standard error of the job are appended to
    '<test_name>.log'.

    Arguments:
    test_name (str): Prefix of the log file.  Also passed as the
                     --geopm-profile value and handed to the
                     launcher factory.
    """
    self._app_conf.write()
    self._ctl_conf.write()
    with open(test_name + '.log', 'a') as log_file:
        # Time stamp the log so appended runs can be told apart.
        log_file.write(str(datetime.datetime.now()) + '\n')
        log_file.flush()

        # The benchmark binary is taken from the .libs directory one
        # level above the directory that contains this file.
        # Using libtool causes sporadic issues with the Intel toolchain.
        this_dir = os.path.dirname(os.path.realpath(__file__))
        source_dir = os.path.dirname(this_dir)
        exec_path = os.path.join(source_dir, '.libs', 'geopmbench')

        # The first element is a placeholder for the program name.
        argv = ['dummy']
        argv += ['--geopm-ctl', self._pmpi_ctl]
        argv += ['--geopm-policy', self._ctl_conf.get_path()]
        argv += ['--geopm-report', self._report_path]
        argv += ['--geopm-profile', test_name]
        if self._trace_path is not None:
            argv += ['--geopm-trace', self._trace_path]
        if self._region_barrier:
            argv += ['--geopm-barrier']
        # Everything after '--' is the application command line.
        argv += ['--', exec_path, '--verbose', self._app_conf.get_path()]

        launcher = geopmpy.launcher.factory(argv,
                                            self._num_rank,
                                            self._num_node,
                                            self._cpu_per_rank,
                                            self._timeout,
                                            self._time_limit,
                                            test_name,
                                            self._node_list,
                                            self._host_file)
        launcher.run(stdout=log_file, stderr=log_file)
def launch(self, geopm_ctl='process', do_geopm_barrier=False):
    """
    Run the application once for every frequency returned by
    sys_freq_avail(), with GEOPM_SIMPLE_FREQ_MIN and
    GEOPM_SIMPLE_FREQ_MAX both set to that frequency.

    Every report path is recorded in self._report_paths.  A run whose
    report file already exists is skipped with a warning.

    Arguments:
    geopm_ctl (str): Value passed to --geopm-ctl.
    do_geopm_barrier (bool): If True, --geopm-barrier is passed.

    Raises:
    RuntimeError: A report file is missing and no application
                  command line was configured.
    """
    policy = {'power_budget': 400,
              'tree_decider': 'static_policy',
              'leaf_decider': 'simple_freq',
              'platform': 'rapl'}
    ctl_conf = geopmpy.io.CtlConf(self._name + '_ctl.config', 'dynamic', policy)
    ctl_conf.write()

    # Make sure neither of these variables is set during the sweep.
    for var_name in ('GEOPM_SIMPLE_FREQ_RID_MAP', 'GEOPM_SIMPLE_FREQ_ADAPTIVE'):
        os.environ.pop(var_name, None)

    for freq in sys_freq_avail():
        profile_name = fixed_freq_name(self._name, freq)
        report_path = os.path.join(self._output_dir, profile_name + '.report')
        self._report_paths.append(report_path)

        if os.path.exists(report_path):
            sys.stderr.write('<geopmpy>: Warning: output file "{}" exists, skipping run.\n'.format(report_path))
            continue
        if not self._app_argv:
            raise RuntimeError('<geopmpy>: output file "{}" does not exist, but no application was specified.\n'.format(report_path))

        # Identical lower and upper bounds select a single frequency.
        freq_str = str(freq)
        os.environ['GEOPM_SIMPLE_FREQ_MIN'] = freq_str
        os.environ['GEOPM_SIMPLE_FREQ_MAX'] = freq_str
        argv = ['dummy',
                '--geopm-ctl', geopm_ctl,
                '--geopm-policy', ctl_conf.get_path(),
                '--geopm-report', report_path,
                '--geopm-profile', profile_name]
        if do_geopm_barrier:
            argv.append('--geopm-barrier')
        argv.append('--')
        argv.extend(self._app_argv)
        geopmpy.launcher.factory(argv, self._num_rank, self._num_node).run()
def run(self, test_name):
    """
    Write the application and agent configuration files, launch the
    application through the GEOPM launcher and, when an MSR snapshot
    path has been recorded, restore the MSRs afterwards.

    Standard output and standard error of the job are appended to
    '<test_name>.log'.

    Arguments:
    test_name (str): Prefix of the log file.  Also passed as the
                     --geopm-profile value and handed to the
                     launcher factory.
    """
    self._app_conf.write()
    self._agent_conf.write()
    with open(test_name + '.log', 'a') as outfile:
        outfile.write(str(datetime.datetime.now()) + '\n')
        outfile.flush()
        argv = ['dummy', detect_launcher(),
                '--geopm-ctl', self._pmpi_ctl,
                '--geopm-agent', self._agent_conf.get_agent(),
                '--geopm-policy', self._agent_conf.get_path(),
                '--geopm-profile', test_name]
        if self._report_path is not None:
            argv.extend(['--geopm-report', self._report_path])
        if self._trace_path is not None:
            argv.extend(['--geopm-trace', self._trace_path])
        if self._region_barrier:
            argv.append('--geopm-region-barrier')
        # Everything after '--' is the application command line.
        argv.append('--')
        # An optional wrapper command (taken from the environment) is
        # placed in front of the application executable.
        exec_wrapper = os.getenv('GEOPM_EXEC_WRAPPER', '')
        if exec_wrapper:
            argv.extend(shlex.split(exec_wrapper))
        # Use app config to get path and arguments
        argv.append(self._app_conf.get_exec_path())
        argv.append('--verbose')
        argv.extend(self._app_conf.get_exec_args())
        launcher = geopmpy.launcher.Factory().create(argv,
                                                     self._num_rank,
                                                     self._num_node,
                                                     self._cpu_per_rank,
                                                     self._timeout,
                                                     self._time_limit,
                                                     test_name,
                                                     self._node_list,
                                                     self._host_file)
        launcher.run(stdout=outfile, stderr=outfile)
        if self._msr_save_path is not None:
            # msr_restore is a method of this object: it reads
            # self._msr_save_path and the job geometry, so it has to
            # be called through self (a bare name would not resolve
            # to the method).
            self.msr_restore()
def set_num_cpu(self):
    """
    Run lscpu through the launcher (one rank on one node) and derive
    self._num_cpu, the number of CPUs available to the application,
    from its output.
    """
    # Figure out the number of CPUs per rank leaving one for the
    # OS and one (potentially, may/may not be used depending on
    # pmpi_ctl) for the controller.
    argv = ['dummy', detect_launcher(), '--geopm-ctl-disable', 'lscpu']
    launcher = geopmpy.launcher.Factory().create(argv, 1, 1)
    captured = io.StringIO()
    launcher.run(stdout=captured)

    # Collect the integer value of each line of interest, in the
    # order the lines appear in the output.  The indexing below relies
    # on that order being: CPU(s), Thread(s) per core,
    # Core(s) per socket, Socket(s).
    wanted_prefixes = ('CPU(s):',
                       'Thread(s) per core:',
                       'Core(s) per socket:',
                       'Socket(s):')
    cpu_thread_core_socket = []
    for line in captured.getvalue().splitlines():
        if line.startswith(wanted_prefixes):
            cpu_thread_core_socket.append(int(line.split(':')[1]))

    # Only a value that compares equal to True selects the
    # hyper-threaded layout.
    use_hyper_threads = self._performance == True
    # Multiply cores per socket by the number of sockets and remove
    # one CPU for the BSP to get the cores left for the application.
    app_cores = cpu_thread_core_socket[2] * cpu_thread_core_socket[3] - 1
    if use_hyper_threads:
        # Use hyper-threads: scale by the threads per core.
        self._num_cpu = app_cores * cpu_thread_core_socket[1]
    else:
        # Don't use hyper-threads.
        self._num_cpu = app_cores
def check_run(self, test_name):
    """
    Launch the 'true' command with the configured job geometry and the
    --geopm-ctl-disable option, appending the job output to
    '<test_name>.log'.

    Arguments:
    test_name (str): Prefix of the log file.
    """
    argv = ['dummy', detect_launcher(), '--geopm-ctl-disable', 'true']
    with open(test_name + '.log', 'a') as log_file:
        factory = geopmpy.launcher.Factory()
        launcher = factory.create(argv,
                                  self._num_rank,
                                  self._num_node,
                                  self._cpu_per_rank,
                                  self._timeout,
                                  self._time_limit,
                                  self._job_name,
                                  self._node_list,
                                  self._exclude_list,
                                  self._host_file)
        launcher.run(stdout=log_file, stderr=log_file)
def check_run(self, test_name):
    """
    Launch the 'true' command with the configured job geometry,
    appending the job output to '<test_name>.log'.

    Arguments:
    test_name (str): Prefix of the log file.
    """
    argv = ['dummy', 'true']
    with open(test_name + '.log', 'a') as log_file:
        launcher = geopmpy.launcher.factory(argv,
                                            self._num_rank,
                                            self._num_node,
                                            self._cpu_per_rank,
                                            self._timeout,
                                            self._time_limit,
                                            self._job_name,
                                            self._node_list,
                                            self._host_file)
        launcher.run(stdout=log_file, stderr=log_file)
def run(self, test_name, include_geopm_policy=True, add_geopm_args=None):
    """
    Run the test as configured at construction time.

    Standard output and standard error of the job are appended to
    '<test_name>.log'.

    Arguments:
    test_name (str): Name of the test run to use for log files and
                     the profile name in reports.
    include_geopm_policy (bool): If True (default), provide
                                 --geopm-policy for the agent
    add_geopm_args (list of str): Additional launcher options placed
                                  after the options generated here
                                  and before the '--' separator.
                                  None (default) adds nothing.
    """
    # A None default avoids sharing one list object between calls.
    if add_geopm_args is None:
        add_geopm_args = []
    self._app_conf.write()
    self._agent_conf.write()
    with open(test_name + '.log', 'a') as outfile:
        outfile.write(str(datetime.datetime.now()) + '\n')
        outfile.flush()
        argv = ['dummy', detect_launcher(),
                '--geopm-ctl', self._pmpi_ctl,
                '--geopm-agent', self._agent_conf.get_agent(),
                '--geopm-profile', test_name]
        if include_geopm_policy:
            argv.extend(['--geopm-policy', self._agent_conf.get_path()])
        if self._report_path is not None:
            argv.extend(['--geopm-report', self._report_path])
        if self._trace_path is not None:
            argv.extend(['--geopm-trace', self._trace_path])
        if self._region_barrier:
            argv.append('--geopm-region-barrier')
        if self._disable_ompt:
            argv.append('--geopm-ompt-disable')
        if self._trace_profile_path:
            argv.extend(['--geopm-trace-profile', self._trace_profile_path])
        if self._report_signals:
            argv.extend(['--geopm-report-signals', self._report_signals])
        if self._trace_signals:
            argv.extend(['--geopm-trace-signals', self._trace_signals])
        argv.extend(add_geopm_args)
        # Everything after '--' is the application command line.
        argv.append('--')
        # An optional wrapper command (taken from the environment) is
        # placed in front of the application executable.
        exec_wrapper = os.getenv('GEOPM_EXEC_WRAPPER', '')
        if exec_wrapper:
            argv.extend(shlex.split(exec_wrapper))
        # Use app config to get path and arguments
        argv.append(self._app_conf.get_exec_path())
        argv.append('--verbose')
        argv.extend(self._app_conf.get_exec_args())
        launcher = geopmpy.launcher.Factory().create(argv,
                                                     self._num_rank,
                                                     self._num_node,
                                                     self._cpu_per_rank,
                                                     self._timeout,
                                                     self._time_limit,
                                                     test_name,
                                                     self._node_list,
                                                     self._exclude_list,
                                                     self._host_file)
        try:
            launcher.run(stdout=outfile, stderr=outfile)
        except (AttributeError, TypeError):
            # NOTE(review): only these two exception types trigger the
            # MSR restore, and nothing is restored here after a
            # successful run - confirm that this is intended.
            if self._msr_save_path is not None:
                self.msr_restore()
            raise
def launch(self, geopm_ctl='process', do_geopm_barrier=False):
    """
    For each of self._iterations iterations, run the application once
    for every frequency returned by sys_freq_avail(), with the
    frequency bounds both set to that frequency.

    Every report path is recorded in self._report_paths.  A run whose
    report file already exists is skipped with a warning.

    Arguments:
    geopm_ctl (str): Value passed to --geopm-ctl.
    do_geopm_barrier (bool): If True, --geopm-barrier is passed.

    Raises:
    RuntimeError: A report file is missing and no application
                  command line was configured.
    """
    policy = {'power_budget': 400,
              'tree_decider': 'static_policy',
              'leaf_decider': 'efficient_freq',
              'platform': 'rapl'}
    ctl_conf = geopmpy.io.CtlConf(self._name + '_ctl.config', 'dynamic', policy)
    # The controller configuration is only written when GEOPM_AGENT
    # is not set; otherwise the file is produced per run below.
    if 'GEOPM_AGENT' not in os.environ:
        ctl_conf.write()
    # Make sure neither of these variables is set during the sweep.
    for var_name in ('GEOPM_EFFICIENT_FREQ_RID_MAP', 'GEOPM_EFFICIENT_FREQ_ONLINE'):
        os.environ.pop(var_name, None)

    freqs = sys_freq_avail()
    for iteration in range(self._iterations):
        for freq in freqs:
            profile_name = fixed_freq_name(self._name, freq)
            run_prefix = profile_name + '_{}'.format(iteration)
            report_path = os.path.join(self._output_dir, run_prefix + '.report')
            trace_path = os.path.join(self._output_dir, run_prefix + '.trace')
            self._report_paths.append(report_path)

            if 'GEOPM_AGENT' in os.environ:
                # Overwrite the policy file with identical lower and
                # upper frequency bounds.
                with open(ctl_conf.get_path(), 'w') as policy_file:
                    policy_file.write('{{"FREQ_MIN" : {}, "FREQ_MAX" : {}}}\n'.format(freq, freq))

            if os.path.exists(report_path):
                sys.stderr.write('<geopmpy>: Warning: output file "{}" exists, skipping run.\n'.format(report_path))
                continue
            if not self._app_argv:
                raise RuntimeError('<geopmpy>: output file "{}" does not exist, but no application was specified.\n'.format(report_path))

            freq_str = str(freq)
            os.environ['GEOPM_EFFICIENT_FREQ_MIN'] = freq_str
            os.environ['GEOPM_EFFICIENT_FREQ_MAX'] = freq_str
            argv = ['dummy',
                    '--geopm-ctl', geopm_ctl,
                    '--geopm-policy', ctl_conf.get_path(),
                    '--geopm-report', report_path,
                    '--geopm-trace', trace_path,
                    '--geopm-profile', profile_name]
            if do_geopm_barrier:
                argv.append('--geopm-barrier')
            argv.append('--')
            argv.extend(self._app_argv)
            geopmpy.launcher.factory(argv, self._num_rank, self._num_node).run()
def allocation_node_test(test_exec, stdout, stderr):
    """
    Launch a command with one rank on one node, passing
    --geopm-ctl-disable to the launcher.

    Arguments:
    test_exec (str): Command line, split with shell-like rules.  Its
                     first word stays first; the launcher name and
                     options are inserted directly after it.
    stdout: Passed to the launcher's run() as first argument.
    stderr: Passed to the launcher's run() as second argument.
    """
    launcher_name = detect_launcher()
    command = shlex.split(test_exec)
    command.insert(1, launcher_name)
    if launcher_name == 'aprun':
        # Use quiet flag with aprun to suppress end of job info string
        command.insert(2, '-q')
    # Inserted last at index 2, so it ends up right behind the
    # launcher name and ahead of any '-q'.
    command.insert(2, '--geopm-ctl-disable')
    job = geopmpy.launcher.Factory().create(command,
                                            num_rank=1,
                                            num_node=1,
                                            job_name="geopm_allocation_test")
    job.run(stdout, stderr)
def msr_restore(self):
    """
    Restores all whitelisted MSRs using msrsave on all compute nodes
    that the job was launched on.

    Does nothing when no snapshot path has been recorded in
    self._msr_save_path.
    """
    if self._msr_save_path is None:
        return
    # 'msrsave -r <path>' is run through the launcher with the same
    # job geometry as the test itself.
    restore_command = 'msrsave -r ' + self._msr_save_path
    command_line = 'dummy {} --geopm-ctl-disable -- {}'.format(detect_launcher(),
                                                               restore_command)
    argv = shlex.split(command_line)
    factory = geopmpy.launcher.Factory()
    launcher = factory.create(argv,
                              self._num_rank,
                              self._num_node,
                              self._cpu_per_rank,
                              self._timeout,
                              self._time_limit,
                              'msr_save',
                              self._node_list,
                              self._host_file)
    launcher.run()
def msr_save(self):
    """
    Snapshots all whitelisted MSRs using msrsave on all compute nodes
    that the job will be launched on.

    The snapshot path, '/tmp/geopm-msr-save-<user>', is recorded in
    self._msr_save_path.
    """
    save_path = '/tmp/geopm-msr-save-' + getpass.getuser()
    self._msr_save_path = save_path
    # 'msrsave <path>' is run through the launcher with the same job
    # geometry as the test itself.
    save_command = 'msrsave ' + save_path
    command_line = 'dummy {} --geopm-ctl-disable -- {}'.format(detect_launcher(),
                                                               save_command)
    argv = shlex.split(command_line)
    factory = geopmpy.launcher.Factory()
    launcher = factory.create(argv,
                              self._num_rank,
                              self._num_node,
                              self._cpu_per_rank,
                              self._timeout,
                              self._time_limit,
                              'msr_save',
                              self._node_list,
                              self._host_file)
    launcher.run()
def test_non_file_output(self, mock_popen):
    """
    Test that the launcher can redirect stdout and stderr to a
    non-file writable object.
    """
    argv = ['unittest_geopm_launcher', 'srun', 'unittest_workload']
    captured_out = StringIO()
    captured_err = StringIO()

    factory = geopmpy.launcher.Factory()
    launcher = factory.create(argv, num_rank=2, num_node=1)
    launcher.run(stdout=captured_out, stderr=captured_err)

    # The workload constants are bytes; the streams hold text.
    expected_out = UNITTEST_WORKLOAD_STDOUT.decode()
    self.assertIn(expected_out, captured_out.getvalue())
    expected_err = UNITTEST_WORKLOAD_STDERR.decode()
    self.assertIn(expected_err, captured_err.getvalue())
def launch(self, geopm_ctl='process', do_geopm_barrier=False):
    """
    Run the frequency sweep, then run the desired comparison
    configuration.

    The comparison run uses the profile '<name>_online' with the
    frequency bounds set to the lowest and highest values returned by
    sys_freq_avail().  It is skipped with a warning when its report
    file already exists.

    Arguments:
    geopm_ctl (str): Value passed to --geopm-ctl.
    do_geopm_barrier (bool): If True, --geopm-barrier is passed.

    Raises:
    RuntimeError: The report file is missing and no application
                  command line was configured.
    """
    ctl_conf = geopmpy.io.CtlConf(self._name + '_ctl.config',
                                  'dynamic',
                                  {'power_budget': 400,
                                   'tree_decider': 'static_policy',
                                   'leaf_decider': 'simple_freq',
                                   'platform': 'rapl'})
    ctl_conf.write()

    # Run frequency sweep
    self._sweep_analysis.launch(geopm_ctl, do_geopm_barrier)
    parse_output = self._sweep_analysis.parse()
    process_output = self._sweep_analysis.report_process(parse_output)
    # NOTE(review): the resulting string is not consumed below; the
    # calls are kept in case they have side effects - confirm.
    region_freq_str = self._sweep_analysis._region_freq_str(process_output)

    # Run online frequency decider
    # The variable names mirror the ones the frequency sweep clears
    # before its fixed-frequency runs: the adaptive flag is switched
    # on and any region frequency map is removed.
    os.environ['GEOPM_SIMPLE_FREQ_ADAPTIVE'] = 'yes'
    os.environ.pop('GEOPM_SIMPLE_FREQ_RID_MAP', None)

    profile_name = self._name + '_online'
    report_path = os.path.join(self._output_dir, profile_name + '.report')
    self._report_paths.append(report_path)

    # Query the available frequencies once and take both bounds from
    # the same result.
    freqs = sys_freq_avail()
    self._min_freq = min(freqs)
    self._max_freq = max(freqs)

    if self._app_argv and not os.path.exists(report_path):
        os.environ['GEOPM_SIMPLE_FREQ_MIN'] = str(self._min_freq)
        os.environ['GEOPM_SIMPLE_FREQ_MAX'] = str(self._max_freq)
        argv = ['dummy',
                '--geopm-ctl', geopm_ctl,
                '--geopm-policy', ctl_conf.get_path(),
                '--geopm-report', report_path,
                '--geopm-profile', profile_name]
        if do_geopm_barrier:
            argv.append('--geopm-barrier')
        argv.append('--')
        argv.extend(self._app_argv)
        launcher = geopmpy.launcher.factory(argv, self._num_rank, self._num_node)
        launcher.run()
    elif os.path.exists(report_path):
        sys.stderr.write('<geopmpy>: Warning: output file "{}" exists, skipping run.\n'.format(report_path))
    else:
        raise RuntimeError('<geopmpy>: output file "{}" does not exist, but no application was specified.\n'.format(report_path))
def set_num_cpu(self):
    """
    Run lscpu through the launcher (one rank on one node) and derive
    self._num_cpu, the number of CPUs available to the application,
    from its output.
    """
    # Figure out the number of CPUs per rank leaving one for the
    # OS and one (potentially, may/may not be used depending on
    # pmpi_ctl) for the controller.
    launcher = geopmpy.launcher.factory(['dummy', 'lscpu'], 1, 1)
    captured = StringIO.StringIO()
    launcher.run(stdout=captured)

    # Collect the integer value of each matching line, in the order
    # the lines appear in the output.
    wanted_prefixes = ('Core(s) per socket:', 'Socket(s):')
    core_socket = []
    for line in captured.getvalue().splitlines():
        if line.startswith(wanted_prefixes):
            core_socket.append(int(line.split(':')[1]))

    # Multiply num core per socket by num socket and remove one
    # CPU for BSP to calculate number of CPU for application.
    # Don't use hyper-threads.
    total_cores = core_socket[0] * core_socket[1]
    self._num_cpu = total_cores - 1
def msr_restore(self):
    """
    Restores all whitelisted MSRs using msrsave on all compute nodes
    that the job was launched on.

    Does nothing when no snapshot path has been recorded in
    self._msr_save_path.
    """
    if self._msr_save_path is None:
        return
    restore_command = 'msrsave -r ' + self._msr_save_path
    command_line = 'dummy {} --geopm-ctl-disable -- {}'.format(detect_launcher(),
                                                               restore_command)
    argv = shlex.split(command_line)
    # We want to execute on every node, so passing the node count
    # both as the number of ranks and as the number of nodes is
    # intentional here and is the best we can do without a whitelist
    # of node names.
    factory = geopmpy.launcher.Factory()
    launcher = factory.create(argv,
                              self._num_node,
                              self._num_node,
                              self._cpu_per_rank,
                              self._timeout,
                              self._time_limit,
                              'msr_save',
                              self._node_list,
                              self._exclude_list,
                              self._host_file)
    launcher.run()
def msr_save(self):
    """
    Snapshots all whitelisted MSRs using msrsave on all compute nodes
    that the job will be launched on.

    The snapshot path, '/tmp/geopm-msr-save-<user>', is recorded in
    self._msr_save_path.
    """
    self._msr_save_path = '/tmp/geopm-msr-save-' + getpass.getuser()
    launch_command = 'msrsave ' + self._msr_save_path
    argv = shlex.split('dummy {} --geopm-ctl-disable -- {}'.format(
        detect_launcher(), launch_command))
    # We want to execute on every node so
    # (argv, self._num_node, self._num_node, ...
    # is intentional here and is the best we can do
    # without a whitelist of node names
    #
    # The exclude list is passed between the node list and the host
    # file, exactly as the other launcher invocations of this object
    # do; leaving it out would hand the host file to the launcher in
    # the exclude list position.
    launcher = geopmpy.launcher.Factory().create(argv,
                                                 self._num_node,
                                                 self._num_node,
                                                 self._cpu_per_rank,
                                                 self._timeout,
                                                 self._time_limit,
                                                 'msr_save',
                                                 self._node_list,
                                                 self._exclude_list,
                                                 self._host_file)
    launcher.run()
def test_process_count(self, mock_popen):
    """
    Test that geopm requests an additional rank for itself by default.
    """
    argv = ['unittest_geopm_launcher', 'srun', 'unittest_workload']
    launcher = geopmpy.launcher.Factory().create(argv, num_rank=2, num_node=1)
    launcher.run()

    # Recover the srun command line from the recorded Popen call.
    srun_args, srun_kwargs = mock_popen.call_args
    launched_through_shell = srun_kwargs.get('shell', False)
    if launched_through_shell:
        # With shell=True the command is a single string; split it
        # back into words.
        srun_args = shlex.split(srun_args[0])

    # One node was requested.
    self.assertIn('-N', srun_args)
    node_count = srun_args[srun_args.index('-N') + 1]
    self.assertEqual('1', node_count)

    # Expect an additional rank per node for the geopm process
    self.assertIn('-n', srun_args)
    rank_count = srun_args[srun_args.index('-n') + 1]
    self.assertEqual('3', rank_count)
def test_policy_custom(self):
    """
    Launch the application under the profile 'power_custom' and check
    the profile and policy recorded in the report as well as the
    power cap recorded in the endpoint policy trace.
    """
    profile = 'power_custom'
    report_path = profile + '.report'
    policy_trace = profile + '.trace-policy'
    # Record the generated files in self._files.
    self._files.extend([report_path, policy_trace])

    geopm_options = ['--geopm-profile', profile,
                     '--geopm-trace-endpoint-policy', policy_trace,
                     '--geopm-report', report_path]
    self._argv.extend(geopm_options)
    self._argv.append(self._app_conf.get_exec_path())
    self._argv.extend(self._app_conf.get_exec_args())

    factory = geopmpy.launcher.Factory()
    launcher = factory.create(self._argv, self._num_rank, self._num_node)
    launcher.run()

    meta_data = geopmpy.io.RawReport(report_path).meta_data()
    self.assertEqual(meta_data['Profile'], profile)
    self.assertEqual(meta_data['Policy'], 'DYNAMIC')

    # check policy trace: its first row has to carry this power cap
    trace_data = pandas.read_csv(policy_trace, delimiter='|', comment='#')
    first_power_limit = trace_data['POWER_PACKAGE_LIMIT_TOTAL'][0]
    self.assertEqual(first_power_limit, self.custom_power_cap)
def launch(self, geopm_ctl='process', do_geopm_barrier=False):
    """
    Run the frequency sweep, then run the desired comparison
    configuration.

    The comparison run is repeated self._iterations times under the
    profile '<name>_online'.  A run whose report file already exists
    is skipped with a warning.

    Arguments:
    geopm_ctl (str): Value passed to --geopm-ctl.
    do_geopm_barrier (bool): If True, --geopm-barrier is passed.

    Raises:
    RuntimeError: A report file is missing and no application
                  command line was configured.
    """
    policy = {'power_budget': 400,
              'tree_decider': 'static_policy',
              'leaf_decider': 'efficient_freq',
              'platform': 'rapl'}
    ctl_conf = geopmpy.io.CtlConf(self._name + '_ctl.config', 'dynamic', policy)
    # The controller configuration is only written when GEOPM_AGENT
    # is not set; otherwise the file is produced per run below.
    if 'GEOPM_AGENT' not in os.environ:
        ctl_conf.write()

    # Run frequency sweep
    self._sweep_analysis.launch(geopm_ctl, do_geopm_barrier)

    # Run online frequency decider
    for iteration in range(self._iterations):
        os.environ['GEOPM_EFFICIENT_FREQ_ONLINE'] = 'yes'
        os.environ.pop('GEOPM_EFFICIENT_FREQ_RID_MAP', None)

        profile_name = self._name + '_online'
        run_prefix = profile_name + '_{}'.format(iteration)
        report_path = os.path.join(self._output_dir, run_prefix + '.report')
        trace_path = os.path.join(self._output_dir, run_prefix + '.trace')
        self._report_paths.append(report_path)

        # freqs contains a list of available system frequencies in
        # ascending order
        freqs = sys_freq_avail()
        self._min_freq = freqs[0]
        if self._enable_turbo:
            self._max_freq = freqs[-1]
        else:
            # Without turbo the highest entry is left out.
            self._max_freq = freqs[-2]

        if 'GEOPM_AGENT' in os.environ:
            # Overwrite the policy file with the frequency bounds.
            with open(ctl_conf.get_path(), 'w') as policy_file:
                policy_file.write('{{"FREQ_MIN" : {}, "FREQ_MAX" : {}}}\n'.format(
                    self._min_freq, self._max_freq))

        if os.path.exists(report_path):
            sys.stderr.write('<geopmpy>: Warning: output file "{}" exists, skipping run.\n'.format(report_path))
            continue
        if not self._app_argv:
            raise RuntimeError('<geopmpy>: output file "{}" does not exist, but no application was specified.\n'.format(report_path))

        os.environ['GEOPM_EFFICIENT_FREQ_MIN'] = str(self._min_freq)
        os.environ['GEOPM_EFFICIENT_FREQ_MAX'] = str(self._max_freq)
        argv = ['dummy',
                '--geopm-ctl', geopm_ctl,
                '--geopm-policy', ctl_conf.get_path(),
                '--geopm-report', report_path,
                '--geopm-trace', trace_path,
                '--geopm-profile', profile_name]
        if do_geopm_barrier:
            argv.append('--geopm-barrier')
        argv.append('--')
        argv.extend(self._app_argv)
        geopmpy.launcher.factory(argv, self._num_rank, self._num_node).run()
def launch_run(agent_conf, app_conf, run_id, output_dir, extra_cli_args,
               num_nodes, enable_traces, enable_profile_traces):
    """
    Launch one run of an application under GEOPM inside output_dir and
    append the application's figure of merit to the generated report.

    All output files are named '<app>_<agent>_<run_id>.<suffix>' and
    are created relative to output_dir.  The working directory that
    was current on entry is restored on exit, even when the run fails.

    Arguments:
    agent_conf: Agent configuration, or None to run with the
                'monitor' agent.
    app_conf: Application configuration.
    run_id: Identifier used in the output file names.
    output_dir (str): Directory the run is executed in.
    extra_cli_args (list of str): Additional launcher options.  The
                                  list is not modified.
    num_nodes (int): Number of nodes; the rank count is the
                     application's ranks per node times this value.
    enable_traces (bool): If True, pass --geopm-trace.
    enable_profile_traces (bool): If True, pass --geopm-trace-profile.
    """
    # launcher and app should create files in output_dir
    start_dir = os.getcwd()
    os.chdir(output_dir)
    try:
        # TODO: why does launcher strip off first arg, rather than geopmlaunch main?
        argv = ['dummy', util.detect_launcher()]

        agent_name = 'monitor'
        if agent_conf is not None:
            agent_name = agent_conf.get_agent()
            agent_conf.write()
        if agent_name != 'monitor':
            argv.append('--geopm-agent=' + agent_conf.get_agent())
            argv.append('--geopm-policy=' + agent_conf.get_path())

        app_name = app_conf.name()
        uid = '{}_{}_{}'.format(app_name.lower(), agent_name, run_id)
        report_path = '{}.report'.format(uid)
        trace_path = '{}.trace'.format(uid)
        profile_trace_path = '{}.ptrace'.format(uid)
        profile_name = uid
        log_path = '{}.log'.format(uid)
        sys.stdout.write('Run commencing...\nLive job output will be written to: {}\n'
                         .format(os.path.join(output_dir, log_path)))

        # Work on a copy: extending the caller's list in place would
        # make the per-run options pile up when the same list is
        # passed to several calls.
        # TODO: these are not passed to launcher create()
        # some are generic enough they could be, though
        geopm_args = list(extra_cli_args)
        geopm_args += ['--geopm-report', report_path,
                       '--geopm-profile', profile_name]
        if enable_traces:
            geopm_args += ['--geopm-trace', trace_path]
        if enable_profile_traces:
            geopm_args += ['--geopm-trace-profile', profile_trace_path]
        argv.extend(geopm_args)

        # extra geopm args needed by app
        argv.extend(app_conf.get_custom_geopm_args())

        # Everything after '--' is the application command line.
        argv.append('--')
        bash_path = apps.make_bash(app_conf, run_id, log_path)
        argv.append(bash_path)

        num_ranks = app_conf.get_rank_per_node() * num_nodes
        cpu_per_rank = app_conf.get_cpu_per_rank()

        launcher = geopmpy.launcher.Factory().create(argv,
                                                     num_node=num_nodes,
                                                     num_rank=num_ranks,
                                                     cpu_per_rank=cpu_per_rank)
        launcher.run()

        # Get app-reported figure of merit
        fom = app_conf.parse_fom(log_path)
        # Append to report
        with open(report_path, 'a') as report:
            report.write('\nFigure of Merit: {}\n'.format(fom))
    finally:
        # return to previous directory
        os.chdir(start_dir)