def cdt_2105_skip(self):
    """Skip the test where cray-hdf5 is not available.

    cray-hdf5 is supported only on PrgEnv-nvidia for CDT >= 21.05 and,
    conversely, not supported on PrgEnv-pgi for CDT >= 21.05.
    """
    cdt = osext.cray_cdt_version()
    if not cdt:
        # No CDT version detected (e.g., non-Cray system); comparing
        # None against a version string would raise a TypeError, so
        # follow the convention of the other workarounds and bail out.
        return

    if self.current_environ.name == 'PrgEnv-nvidia':
        self.skip_if(
            cdt < '21.05',
            "cray-hdf5 is not supported for cdt < 21.05 on PrgEnv-nvidia")
    elif self.current_environ.name == 'PrgEnv-pgi':
        self.skip_if(
            cdt >= '21.05',
            "cray-hdf5 is not supported for cdt >= 21.05 on PrgEnv-pgi")
def cdt2008_pgi_workaround(self):
    """Point ``CUDA_HOME`` at the Cray toolkit prefix for PGI on CDT 20.08."""
    cdt = osext.cray_cdt_version()
    if cdt == '20.08' and self.current_environ.name == 'PrgEnv-pgi':
        self.variables.update({'CUDA_HOME': '$CUDATOOLKIT_HOME'})
def skip_modules(self):
    """Disable the test for modules known to be broken on old CDTs."""
    # FIXME: These modules should be fixed in later releases
    broken_modules = ('cray-petsc-complex', 'cray-petsc-complex-64',
                      'cudatoolkit')
    cdt = osext.cray_cdt_version()
    if cdt and cdt <= '20.11' and self.cray_module in broken_modules:
        self.valid_systems = []
def test_cray_cdt_version_unknown_fmt(tmp_path, monkeypatch):
    """``cray_cdt_version`` must return ``None`` for an unparsable rcfile."""
    # Mock up a CDT file with contents not matching the expected format
    rcfile = tmp_path / 'rcfile'
    rcfile.write_text('random stuff')
    monkeypatch.setenv('MODULERCFILE', str(rcfile))
    assert osext.cray_cdt_version() is None
def test_cray_cdt_version(tmp_path, monkeypatch):
    """``cray_cdt_version`` must extract the version from a valid rcfile."""
    # Mock up a CDT file with a well-formed version header
    rcfile = tmp_path / 'rcfile'
    rcfile.write_text('#%Module CDT 20.06\nblah blah\n')
    monkeypatch.setenv('MODULERCFILE', str(rcfile))
    assert osext.cray_cdt_version() == '20.06'
def cdt2006_workaround_intel(self):
    """Work around the misnamed netCDF C++ pkg-config file on CDT 20.06.

    CDT 20.06 with PrgEnv-intel ships the parallel netCDF C++ pkg-config
    file under a different name than the build expects, so symlink it and
    make the current directory visible to pkg-config.
    """
    if (osext.cray_cdt_version() != '20.06' or
            self.current_environ.name != 'PrgEnv-intel'):
        return

    self.modules += ['cray-netcdf-hdf5parallel']
    self.prebuild_cmds = [
        'ln -s $CRAY_NETCDF_HDF5PARALLEL_PREFIX/lib/pkgconfig/'
        'netcdf-cxx4_parallel.pc netcdf_c++4_parallel.pc'
    ]
    self.variables['PKG_CONFIG_PATH'] = '.:$PKG_CONFIG_PATH'
def cdt2006_workaround_dynamic(self):
    """Work around dynamic-linking issues of PrgEnv-gnu on CDT 20.06."""
    needs_workaround = (osext.cray_cdt_version() == '20.06' and
                        self.linkage == 'dynamic' and
                        self.current_environ.name == 'PrgEnv-gnu')
    if not needs_workaround:
        return

    # Use the lld linker shipped with the CCE clang toolchain
    self.variables['PATH'] = (
        '/opt/cray/pe/cce/10.0.1/cce-clang/x86_64/bin:$PATH')
    self.prgenv_flags[self.current_environ.name] += ['-fuse-ld=lld']
    # GCC >= 9 is required for the above option; our CUDA-friendly CDT
    # uses GCC 8 as default.
    self.modules += ['gcc/9.3.0']
def __init__(self):
    """Restrict the check to login nodes and known-good modules."""
    super().__init__()
    self.valid_systems = ['daint:login', 'dom:login']
    # FIXME: These modules should be fixed in later releases
    broken_modules = ('cray-petsc-complex', 'cray-petsc-complex-64',
                      'cudatoolkit')
    cdt = osext.cray_cdt_version()
    if cdt and cdt <= '20.11' and self.cray_module in broken_modules:
        self.valid_systems = []
def __init__(self):
    """Set up the ipyparallel-based distributed TensorFlow training check."""
    self.descr = 'Distributed training with TensorFlow using ipyparallel'
    self.valid_systems = ['daint:gpu', 'dom:gpu']
    self.valid_prog_environs = ['PrgEnv-gnu']
    cray_cdt_version = osext.cray_cdt_version()
    # FIXME: The following will not be needed after the Daint upgrade
    if self.current_system.name == 'dom':
        # Dom uses a Horovod build tagged with the running CDT version
        self.modules = [
            'ipcmagic',
            f'Horovod/0.21.0-CrayGNU-{cray_cdt_version}-tf-2.4.0'
        ]
    else:
        # Daint still uses the older, fixed CDT 20.08 Horovod build
        self.modules = [
            'ipcmagic', 'Horovod/0.19.1-CrayGNU-20.08-tf-2.2.0'
        ]
    self.num_tasks = 2
    self.num_tasks_per_node = 1
    self.executable = 'ipython'
    self.executable_opts = ['tf-hvd-sgd-ipc-tf2.py']
    # Sanity: at least two node ids must appear and the first two must
    # differ, i.e. the two tasks really ran on distinct nodes.
    nids = sn.extractall(r'nid(?P<nid>\d+)', self.stdout, 'nid', str)
    self.sanity_patterns = sn.all(
        [sn.assert_ne(nids, []), sn.assert_ne(nids[0], nids[1])])
    self.reference = {
        'daint:gpu': {
            'slope': (2.0, -0.1, 0.1, None),
            'offset': (0.0, -0.1, 0.1, None),
            'retries': (0, None, None, None),
            'time': (10, None, None, 's'),
        },
        'dom:gpu': {
            'slope': (2.0, -0.1, 0.1, None),
            'offset': (0.0, -0.1, 0.1, None),
            'retries': (0, None, None, None),
            'time': (10, None, None, 's'),
        }
    }
    self.perf_patterns = {
        'slope': sn.extractsingle(r'slope=(?P<slope>\S+)',
                                  self.stdout, 'slope', float),
        'offset': sn.extractsingle(r'offset=(?P<offset>\S+)',
                                   self.stdout, 'offset', float),
        # 'retries' counts how many of the 4 attempts were needed before
        # the IPCluster came up (0 means it started on the first try)
        'retries': 4 - sn.count(sn.findall(r'IPCluster is already running',
                                           self.stdout)),
        'time': sn.extractsingle(
            r'IPCluster is ready\!\s+'
            r'\((?P<time>\d+) seconds\)',
            self.stdout, 'time', float)
    }
    self.maintainers = ['RS', 'TR']
    self.tags = {'production'}
def cdt_pgi_workaround(self):
    """Apply the PGI/CUDA compatibility workarounds for the current CDT."""
    cdt = osext.cray_cdt_version()
    if not cdt:
        return

    if cdt != '20.08':
        # FIXME: PGI 20.x does not support CUDA 11, see case #275674
        self.modules += ['cudatoolkit/10.2.89_3.29-7.0.2.1_3.5__g67354b4']
    else:
        self.build_system.fflags += [
            'CUDA_HOME=$CUDATOOLKIT_HOME', '-Mcuda=cuda10.2'
        ]
def __init__(self):
    """Restrict the check to login nodes and known-good modules."""
    super().__init__()
    self.valid_systems = ['daint:login', 'dom:login']
    # FIXME: These modules should be fixed in later releases,
    # while gcc was fixed in 20.11
    cdt = osext.cray_cdt_version()
    # NOTE(review): `module_name` is not defined in this method and is
    # presumably bound in an enclosing scope (e.g., a test-generation
    # loop); verify, otherwise the `gcc` branch raises NameError. The
    # sibling check uses `self.cray_module` here instead.
    if ((cdt and cdt <= '20.11' and
         self.cray_module in ['cray-petsc-complex', 'cray-petsc-complex-64',
                              'cudatoolkit']) or
            (cdt and cdt < '20.11' and module_name == 'gcc')):
        self.valid_systems = []
def test_cray_cdt_version_no_such_file(tmp_path, monkeypatch):
    """``cray_cdt_version`` must return ``None`` for a missing rcfile."""
    # Point MODULERCFILE at a path that was never created
    missing_rcfile = tmp_path / 'rcfile'
    monkeypatch.setenv('MODULERCFILE', str(missing_rcfile))
    assert osext.cray_cdt_version() is None
def cdt_2105_workaround(self):
    """Add an explicit MKL library path for CDT 21.05."""
    # FIXME: The mkl libraries are not found in cdt 21.05, CASE #285117
    if osext.cray_cdt_version() != '21.05':
        return

    self.build_system.ldflags += [
        '-L/opt/intel/oneapi/mkl/latest/lib/intel64/'
    ]
def __init__(self, model, mpi_task):
    """Set up the distributed PyTorch/Horovod synthetic benchmark.

    :param model: name of the torchvision model to benchmark
                  (e.g. ``'inception_v3'``)
    :param mpi_task: total number of MPI tasks (one GPU each)
    """
    self.descr = 'Distributed training with Pytorch and Horovod'
    self.valid_systems = ['daint:gpu']
    # Large jobs only run on Daint; Dom is too small for them
    if mpi_task < 20:
        self.valid_systems += ['dom:gpu']
    self.valid_prog_environs = ['builtin']
    cray_cdt_version = osext.cray_cdt_version()
    self.modules = [f'Horovod/0.19.5-CrayGNU-{cray_cdt_version}-pt-1.6.0']
    self.num_tasks_per_node = 1
    self.num_cpus_per_task = 12
    self.num_tasks = mpi_task
    batch_size = 64
    self.variables = {
        'NCCL_DEBUG': 'INFO',
        'NCCL_IB_HCA': 'ipogif0',
        'NCCL_IB_CUDA_SUPPORT': '1',
        'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
    }
    # Renamed from `hash` to avoid shadowing the builtin of that name
    git_ref = 'master'
    git_url = f'https://raw.githubusercontent.com/horovod/horovod/{git_ref}/examples/pytorch'  # noqa: E501
    git_src = 'pytorch_synthetic_benchmark.py'
    self.prerun_cmds = [f'wget {git_url}/{git_src}']
    if model == 'inception_v3':
        # inception_v3 needs scipy and uses 299x299 inputs and an
        # auxiliary output; patch the downloaded benchmark accordingly.
        self.prerun_cmds += [
            'python3 -m venv --system-site-packages myvenv',
            'source myvenv/bin/activate',
            'pip install scipy',
            'sed -i "s-output = model(data)-output, aux = model(data)-"'
            f' {git_src}',
            'sed -i "s-data = torch.randn(args.batch_size, 3, 224, 224)-'
            f'data = torch.randn(args.batch_size, 3, 299, 299)-"'
            f' {git_src}'
        ]
    self.executable = 'python'
    self.executable_opts = [
        git_src, f'--model {model}', f'--batch-size {batch_size}',
        '--num-iters 5', '--num-batches-per-iter 5'
    ]
    self.tags = {'production'}
    self.maintainers = ['RS', 'HM']
    self.sanity_patterns = sn.all([
        sn.assert_found(rf'Model: {model}', self.stdout),
        sn.assert_found(rf'Batch size: {batch_size}', self.stdout)
    ])
    self.perf_patterns = {
        'throughput_per_gpu': sn.extractsingle(
            r'Img/sec per GPU: (?P<throughput_per_gpu>\S+) \S+',
            self.stdout, 'throughput_per_gpu', float
        ),
        'throughput_per_job': sn.extractsingle(
            r'Total img/sec on \d+ GPU\(s\): (?P<throughput>\S+) \S+',
            self.stdout, 'throughput', float
        ),
    }
    # Per-GPU reference throughput depends on the model; the job-level
    # reference assumes linear scaling across tasks
    ref_per_gpu = 131 if model == 'inception_v3' else 201
    ref_per_job = ref_per_gpu * mpi_task
    self.reference = {
        'dom:gpu': {
            'throughput_per_gpu': (ref_per_gpu, -0.1, None, 'images/s'),
            'throughput_per_job': (ref_per_job, -0.1, None, 'images/s'),
        },
        'daint:gpu': {
            'throughput_per_gpu': (ref_per_gpu, -0.1, None, 'images/s'),
            'throughput_per_job': (ref_per_job, -0.1, None, 'images/s'),
        }
    }
def cdt2008_pgi_workaround(self):
    """Point ``CUDA_HOME`` at the toolkit prefix for PGI on CDT 20.08."""
    affected = (
        self.current_environ.name == 'PrgEnv-pgi'
        and osext.cray_cdt_version() == '20.08'
        and self.current_system.name in ['daint', 'dom']
    )
    if affected:
        self.variables['CUDA_HOME'] = '$CUDATOOLKIT_HOME'