def test_count(self):
    """
    Launch a looped 'spin' region on one node and check that the report
    records the expected runtime and the expected entry counts for both
    the 'spin' region and the 'epoch' entry.
    """
    name = 'test_count'
    report_path = name + '.report'
    trace_path = name + '.trace'
    num_node, num_rank = 1, 4
    delay, loop_count = 0.01, 100

    # Application configuration: a single 'spin' region repeated loop_count times.
    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.set_loop_count(loop_count)
    app_config.append_region('spin', delay)

    # Controller configuration built from the mode/options held by the test case.
    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path,
                                      trace_path, time_limit=None)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path, trace_path)
    node_names = self._output.get_node_names()
    self.assertEqual(len(node_names), num_node)

    # Total expected time in the region is the per-loop delay times the loop count.
    expected_runtime = delay * loop_count
    for node_name in node_names:
        report = self._output.get_report(node_name)
        self.assertNear(expected_runtime, report['spin'].get_runtime())
        self.assertEqual(loop_count, report['spin'].get_count())
        self.assertEqual(loop_count, report['epoch'].get_count())
def test_report_and_trace_generation(self):
    """
    Run a single 'sleep' region on four nodes and check that every node
    produced a non-empty report and a non-empty trace.
    """
    name = 'test_report_and_trace_generation'
    report_path = name + '.report'
    trace_path = name + '.trace'
    num_node, num_rank = 4, 16

    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.append_region('sleep', 1.0)

    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path, trace_path)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path, trace_path)
    node_names = self._output.get_node_names()
    self.assertEqual(num_node, len(node_names))

    for node_name in node_names:
        # Both outputs must exist and contain something for each node.
        node_report = self._output.get_report(node_name)
        self.assertNotEqual(0, len(node_report))
        node_trace = self._output.get_trace(node_name)
        self.assertNotEqual(0, len(node_trace))
def test_runtime_nested(self):
    """
    Run the 'nested-progress' region with a single rank and check the
    'spin' runtime, its agreement with the 'epoch' runtime, the bounds on
    the MPI runtime and the 'spin' entry count taken from the report.
    """
    name = 'test_runtime_nested'
    report_path = name + '.report'
    num_node, num_rank = 1, 1
    delay = 1.0
    loop_count = 2

    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.set_loop_count(loop_count)
    app_config.append_region('nested-progress', delay)

    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path,
                                      time_limit=None)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path)
    node_names = self._output.get_node_names()
    self.assertEqual(num_node, len(node_names))

    # The spin sections of this region sleep for 'delay' seconds twice per loop.
    expected_spin_runtime = 2 * loop_count * delay
    for node_name in node_names:
        report = self._output.get_report(node_name)
        spin = report['spin']
        spin_runtime = spin.get_runtime()
        self.assertNear(expected_spin_runtime, spin_runtime)
        self.assertNear(spin_runtime, report['epoch'].get_runtime(), epsilon=0.01)
        # MPI runtime must be positive but stay below 0.1.
        mpi_runtime = report.get_mpi_runtime()
        self.assertGreater(mpi_runtime, 0)
        self.assertGreater(0.1, mpi_runtime)
        self.assertEqual(loop_count, spin.get_count())
def test_scaling(self):
    """
    Starting with ${num_node} nodes (one rank per node), call check_run()
    to confirm that commands can be executed on that many nodes, then run
    the configured application and verify the report of every node.  The
    node count is doubled after each successful pass and the sequence is
    repeated until check_run() fails with a return code of 1.

    WARNING: This test can take a long time to run depending on the
    number of starting nodes and the size of the allocation.
    """
    name = 'test_scaling'
    report_path = name + '.report'
    num_node = 2
    loop_count = 100

    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.append_region('dgemm', 1.0)
    app_config.append_region('all2all', 1.0)
    app_config.set_loop_count(loop_count)

    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path,
                                      time_limit=None)

    while True:
        launcher.set_num_node(num_node)
        launcher.set_num_rank(num_node)
        try:
            launcher.check_run(name)
        except subprocess.CalledProcessError as e:
            # If we exceed the available nodes in the allocation ALPS/SLURM give a rc of 1
            # All other rc's are real errors
            if e.returncode != 1:
                raise e
            break

        launcher.write_log(name, 'About to run on {} nodes.'.format(num_node))
        launcher.run(name)

        self._output = geopm_io.AppOutput(report_path)
        node_names = self._output.get_node_names()
        self.assertEqual(len(node_names), num_node)

        for node_name in node_names:
            report = self._output.get_report(node_name)
            self.assertEqual(loop_count, report['dgemm'].get_count())
            self.assertEqual(loop_count, report['all2all'].get_count())
            self.assertGreater(report['dgemm'].get_runtime(), 0.0)
            self.assertGreater(report['all2all'].get_runtime(), 0.0)

        # Scale up for the next pass and clear this pass's output files.
        num_node *= 2
        self._output.remove_files()
def test_report_generation_all_nodes(self):
    """
    Run a one-rank 'sleep' job on each idle node in turn and check that
    the resulting output holds a valid report for every node that was
    still idle when its job was launched.
    """
    name = 'test_report_generation_all_nodes'
    report_path = name + '.report'
    num_node, num_rank = 1, 1
    delay = 1.0

    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.append_region('sleep', delay)

    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)

    # Wait a moment to finish cleaning-up from a previous test
    time.sleep(5)

    idle_nodes = launcher.get_idle_nodes()
    # Iterate over a snapshot because idle_nodes is pruned inside the loop.
    nodes_to_visit = list(idle_nodes)
    alloc_nodes = launcher.get_alloc_nodes()
    launcher.write_log(name, 'Idle nodes : {nodes}'.format(nodes=idle_nodes))
    launcher.write_log(name, 'Alloc\'d nodes : {nodes}'.format(nodes=alloc_nodes))

    for node in nodes_to_visit:
        # Hack to convert string to list
        launcher.set_node_list(node.split())
        try:
            launcher.run(name)
        except subprocess.CalledProcessError as e:
            if e.returncode != 1 or node in launcher.get_idle_nodes():
                launcher.write_log(name, 'Return code = {code}'.format(code=e.returncode))
                raise e
            # Return code 1 on a node that is no longer idle: tolerate it
            # and stop expecting a report from that node.
            launcher.write_log(
                name, '{node} has disappeared from the idle list!'.format(node=node))
            idle_nodes.remove(node)

    self._output = geopm_io.AppOutput(report_path)
    node_names = self._output.get_node_names()
    self.assertEqual(len(node_names), len(idle_nodes))

    for node_name in node_names:
        report = self._output.get_report(node_name)
        self.assertNotEqual(0, len(report))
        self.assertNear(delay, report['sleep'].get_runtime())
        self.assertGreater(report.get_runtime(), report['sleep'].get_runtime())
        self.assertEqual(1, report['sleep'].get_count())
def test_progress_exit(self):
    """
    Check that the progress reported within a region never moves
    backwards by a partial amount.  Within each traced region other than
    region '0', consecutive 'progress-0' samples must not differ by a
    value strictly between -1 and 0.
    """
    name = 'test_progress_exit'
    report_path = name + '.report'
    trace_path = name + '.trace'
    num_node, num_rank = 1, 16
    loop_count = 100
    big_o = 1.0

    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.set_loop_count(loop_count)
    app_config.append_region('dgemm-progress', big_o)
    app_config.append_region('spin-progress', 0.01)

    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path,
                                      trace_path, time_limit=None)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path, trace_path)
    node_names = self._output.get_node_names()
    self.assertEqual(num_node, len(node_names))

    for node_name in node_names:
        # The report is requested but its contents are not inspected here.
        self._output.get_report(node_name)
        trace = self._output.get_trace(node_name)
        by_region = trace.set_index(['region_id'], append=True).groupby(level=['region_id'])
        for region_id, region_trace in by_region:
            if region_id == '0':
                continue
            progress_step = region_trace['progress-0'].diff()
            # Steps of -1 or lower are not flagged (presumably a reset of
            # progress on region re-entry); only partial decreases are.
            negative_progress = progress_step.loc[(progress_step > -1) & (progress_step < 0)]
            launcher.write_log(name, '{}'.format(negative_progress))
            self.assertEqual(0, len(negative_progress))
def test_sample_rate(self):
    """
    Check that the trace is sampled quickly and at a regular interval:
    the mean time between samples must be below max_mean and its
    normalized standard deviation below max_nstd.
    """
    name = 'test_sample_rate'
    report_path = name + '.report'
    trace_path = name + '.trace'
    num_node, num_rank = 1, 16
    loop_count = 10
    big_o = 10.0
    region = 'dgemm-progress'
    max_mean = 0.01  # 10 millisecond max sample period
    max_nstd = 0.1   # 10% normalized standard deviation (std / mean)

    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.set_loop_count(loop_count)
    app_config.append_region(region, big_o)

    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path,
                                      trace_path, time_limit=None)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path, trace_path)
    node_names = self._output.get_node_names()
    self.assertEqual(num_node, len(node_names))

    for node_name in node_names:
        # The report is requested but its contents are not inspected here.
        self._output.get_report(node_name)
        trace = self._output.get_trace(node_name)
        sample_period = trace['seconds'].diff()
        # Ignore consecutive rows that carry the same timestamp.
        sample_period = sample_period.loc[sample_period != 0]
        self.assertGreater(max_mean, sample_period.mean())
        self.assertGreater(max_nstd, sample_period.std() / sample_period.mean())
def test_trace_runtimes(self):
    """
    Run 'sleep', 'dgemm' and 'all2all' regions on four nodes and check,
    for every node, that the runtimes derived from the trace agree with
    the runtimes stated in the report: once for the whole application and
    once for each region with a non-zero reported runtime.
    """
    # Prefix for the config, report and trace files; it matches the test
    # method name, as in the other tests of this suite.
    name = 'test_trace_runtimes'
    report_path = name + '.report'
    trace_path = name + '.trace'
    num_node = 4
    num_rank = 16

    app_conf = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_conf.get_path())
    app_conf.append_region('sleep', 1.0)
    app_conf.append_region('dgemm', 1.0)
    app_conf.append_region('all2all', 1.0)

    ctl_conf = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_conf.get_path())

    launcher = geopm_launcher.factory(app_conf, ctl_conf, report_path, trace_path)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path, trace_path)
    node_names = self._output.get_node_names()
    self.assertEqual(len(node_names), num_node)

    for nn in node_names:
        report = self._output.get_report(nn)
        trace = self._output.get_trace(nn)
        # The timestamp of the last trace row is compared with the
        # application runtime from the report.
        self.assertNear(trace.iloc[-1]['seconds'], report.get_runtime())

        # Calculate runtime totals for each region in each trace, compare to report
        tt = trace.set_index(['region_id'], append=True)
        tt = tt.groupby(level=['region_id'])
        for region_name, region_data in report.iteritems():
            if region_data.get_runtime() != 0:
                trace_data = tt.get_group((region_data.get_id()))
                # Elapsed time between the first and last trace rows
                # that belong to this region.
                trace_elapsed_time = (trace_data.iloc[-1]['seconds'] -
                                      trace_data.iloc[0]['seconds'])
                self.assertNear(trace_elapsed_time, region_data.get_runtime())
def test_runtime(self):
    """
    Run a single 'sleep' region on one node and check that its reported
    runtime is near the requested delay and smaller than the runtime
    reported for the application as a whole.
    """
    name = 'test_runtime'
    report_path = name + '.report'
    num_node, num_rank = 1, 5
    delay = 3.0

    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.append_region('sleep', delay)

    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path)
    node_names = self._output.get_node_names()
    self.assertEqual(num_node, len(node_names))

    for node_name in node_names:
        report = self._output.get_report(node_name)
        self.assertNear(delay, report['sleep'].get_runtime())
        # Total runtime must exceed the time spent in the region alone.
        self.assertGreater(report.get_runtime(), report['sleep'].get_runtime())
def test_power_consumption(self):
    """
    Run a 'dgemm' workload under a 150 W power budget on four nodes and
    check, from the energy columns of each node's trace, that the 75th
    percentile of combined socket + DRAM power stays within 2% of the
    budget.
    """
    name = 'test_power_consumption'
    report_path = name + '.report'
    trace_path = name + '.trace'
    num_node, num_rank = 4, 16
    loop_count = 500

    app_config = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_config.get_path())
    app_config.append_region('dgemm', 8.0)
    app_config.set_loop_count(loop_count)

    # The budget has to be in the options before the controller config is built.
    self._options['power_budget'] = 150
    ctl_config = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_config.get_path())

    launcher = geopm_launcher.factory(app_config, ctl_config, report_path,
                                      trace_path, time_limit=15)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.write_log(name, 'Power cap = {}W'.format(self._options['power_budget']))
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path, trace_path)
    node_names = self._output.get_node_names()
    self.assertEqual(num_node, len(node_names))

    # Total power consumed will be Socket(s) + DRAM
    power_by_node = {}
    for node_name in node_names:
        # The report is requested but its contents are not inspected here.
        self._output.get_report(node_name)
        trace = self._output.get_trace(node_name)

        # Drop all startup data: keep rows from the first one whose
        # region_id equals the value below (treated as the epoch marker).
        is_epoch = trace['region_id'] == '9223372036854775808'
        first_epoch_index = trace.loc[is_epoch][:1].index[0]
        steady_state = trace[first_epoch_index:]

        # Row-to-row differences of the energy counters and of the timestamp.
        samples = steady_state.filter(regex='energy')
        samples['seconds'] = steady_state['seconds']
        samples = samples.diff().dropna()
        samples.rename(columns={'seconds': 'elapsed_time'}, inplace=True)
        # Keep only the rows in which every column changed (no zero deltas).
        samples = samples.loc[(samples != 0).all(axis=1)]

        pkg_energy_cols = [col for col in samples.keys() if 'pkg_energy' in col]
        dram_energy_cols = [col for col in samples.keys() if 'dram_energy' in col]
        elapsed = samples['elapsed_time']
        samples['socket_power'] = samples[pkg_energy_cols].sum(axis=1) / elapsed
        samples['dram_power'] = samples[dram_energy_cols].sum(axis=1) / elapsed
        samples['combined_power'] = samples['socket_power'] + samples['dram_power']

        pandas.set_option('display.width', 100)
        launcher.write_log(
            name, 'Power stats from {} :\n{}'.format(node_name, samples.describe()))

        power_by_node[node_name] = samples

    # Allow for overages of 2% at the 75th percentile.
    power_limit = self._options['power_budget'] * 1.02
    for node_name, samples in power_by_node.iteritems():
        self.assertGreater(power_limit, samples['combined_power'].quantile(.75))
def test_region_runtimes(self):
    """
    Run a looped 'dgemm' region on four nodes and check that the region
    runtimes stated in each node's report agree with the time spent in
    the region as reconstructed from that node's trace.

    The trace side is reconstructed per region by pairing the first row
    of each region pass with the row where 'progress-0' reaches 1 and
    summing the elapsed 'seconds' of those pairs.
    """
    # Prefix for the config, report and trace files; it matches the test
    # method name, as in the other tests of this suite.
    name = 'test_region_runtimes'
    report_path = name + '.report'
    trace_path = name + '.trace'
    num_node = 4
    num_rank = 16
    loop_count = 500

    app_conf = geopm_io.AppConf(name + '_app.config')
    self._tmp_files.append(app_conf.get_path())
    app_conf.append_region('dgemm', 8.0)
    app_conf.set_loop_count(loop_count)

    ctl_conf = geopm_io.CtlConf(name + '_ctl.config', self._mode, self._options)
    self._tmp_files.append(ctl_conf.get_path())

    launcher = geopm_launcher.factory(app_conf, ctl_conf, report_path,
                                      trace_path, time_limit=15)
    launcher.set_num_node(num_node)
    launcher.set_num_rank(num_rank)
    launcher.run(name)

    self._output = geopm_io.AppOutput(report_path, trace_path)
    node_names = self._output.get_node_names()
    self.assertEqual(len(node_names), num_node)

    # Calculate region times from traces
    region_times = collections.defaultdict(lambda: collections.defaultdict(dict))
    for nn in node_names:
        tt = self._output.get_trace(nn).set_index(
            ['region_id'], append=True).groupby(level=['region_id'])

        for region_id, data in tt:
            if region_id != '0':
                # Build a df with only the first region entry and the exit.
                last_index = 0
                row_list = []
                progress_1s = data['progress-0'].loc[data['progress-0'] == 1]
                for index, _ in progress_1s.iteritems():
                    # Rows from the previous exit up to this exit: the
                    # first is the entry, the last is the exit itself.
                    span = data.ix[last_index:index]
                    row_list.append(span.head(1)[['seconds', 'progress-0']])
                    row_list.append(span.tail(1)[['seconds', 'progress-0']])
                    # Set the next starting index to be one past where we are
                    last_index = index[0] + 1
                filtered_df = pandas.concat(row_list)
                filtered_df = filtered_df.diff()
                # Entry (progress 0) and exit (progress 1) rows are interleaved, so after
                # the diff only the rows where 1 - 0 = 1 in the progress column hold the
                # elapsed time of one pass through the region.
                filtered_df = filtered_df.loc[filtered_df['progress-0'] == 1]

                if len(filtered_df) > 1:
                    launcher.write_log(
                        name,
                        'Region elapsed time stats from {} - {} :\n{}'.format(
                            nn, region_id, filtered_df['seconds'].describe()))
                    region_times[nn][region_id] = filtered_df

        launcher.write_log(name, '{}'.format('-' * 80))

    # Loop through the reports to see if the region runtimes line up with
    # what was calculated from the trace files above.
    write_regions = True
    for nn in node_names:
        rr = self._output.get_report(nn)
        for region_name, region in rr.iteritems():
            if region.get_id() != 0 and region.get_count() > 1:
                if write_regions:
                    launcher.write_log(
                        name, 'Region {} is {}.'.format(region.get_id(), region_name))
                self.assertNear(
                    region.get_runtime(),
                    region_times[nn][region.get_id()]['seconds'].sum())
        # The id-to-name mapping is only logged for the first node.
        write_regions = False