Exemple #1
0
 def assert_count_gpus(self):
     '''Sanity check: every assigned task found its GPUs and each GPU
     reported a kernel-launch latency line.

     Returns a deferred ``sn.all`` over two assertions:
     one "Found N gpu(s)" line per task, and one latency line per GPU
     (tasks * gpus-per-node in total).
     '''
     return sn.all([
         # One '[host] Found N gpu(s)' line is expected per assigned task.
         sn.assert_eq(
             sn.count(sn.findall(r'\[\S+\] Found \d+ gpu\(s\)',
                                 self.stdout)), self.num_tasks_assigned),
         # One latency line per GPU across all tasks.
         sn.assert_eq(
             sn.count(
                 sn.findall(
                     r'\[\S+\] \[gpu \d+\] Kernel launch '
                     r'latency: \S+ us', self.stdout)),
             self.num_tasks_assigned * self.num_gpus_per_node)
     ])
def test_sanity_multiple_patterns(dummytest, sanity_file, dummy_gpu_exec_ctx):
    '''Sanity checking passes when the required match count is present and
    raises ``SanityError`` when more matches are demanded than exist.
    '''
    sanity_file.write_text('result1 = success\n' 'result2 = success\n')

    # Simulate a pure sanity test; reset the perf_patterns
    dummytest.perf_patterns = None
    # Exactly two 'resultN = success' lines were written above.
    dummytest.sanity_patterns = sn.assert_eq(
        sn.count(sn.findall(r'result\d = success', sanity_file)), 2)
    _run_sanity(dummytest, *dummy_gpu_exec_ctx, skip_perf=True)

    # Require more patterns to be present
    # Only two matches exist, so requiring three must fail.
    dummytest.sanity_patterns = sn.assert_eq(
        sn.count(sn.findall(r'result\d = success', sanity_file)), 3)
    with pytest.raises(SanityError):
        _run_sanity(dummytest, *dummy_gpu_exec_ctx, skip_perf=True)
Exemple #3
0
 def assert_count_gpus(self):
     '''Assert GPU count is consistent.'''
     return sn.all([
         # One '[host] Found N gpu(s)' line per job task.
         sn.assert_eq(
             sn.count(sn.findall(r'\[\S+\] Found \d+ gpu\(s\)',
                                 self.stdout)),
             sn.getattr(self.job, 'num_tasks')),
         # One kernel-launch latency line per GPU (tasks * gpus-per-node).
         sn.assert_eq(
             sn.count(
                 sn.findall(
                     r'\[\S+\] \[gpu \d+\] Kernel launch '
                     r'latency: \S+ us', self.stdout)),
             self.job.num_tasks * self.num_gpus_per_node)
     ])
    def __init__(self):
        '''Flexible CUDA memtest: download, build and run cuda_memtest,
        requiring every valid sub-test to finish on every assigned task.
        '''
        super().__init__()
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-cray']
        self.descr = 'Flexible Cuda Memtest'
        self.maintainers = ['TM', 'VK']
        self.num_tasks_per_node = 1
        # num_tasks == 0: flexible task count (cf. 'Flexible' in descr);
        # the actual count is read back via num_tasks_assigned below.
        self.num_tasks = 0
        self.num_gpus_per_node = 1
        self.modules = ['cudatoolkit']
        # No local sources; the tarball is fetched at build time.
        self.sourcesdir = None
        src_url = ('https://downloads.sourceforge.net/project/cudagpumemtest/'
                   'cuda_memtest-1.2.3.tar.gz')
        self.prebuild_cmd = [
            'wget %s' % src_url,
            'tar -xzf cuda_memtest-1.2.3.tar.gz --strip-components=1'
        ]
        self.executable = 'cuda_memtest_sm20'
        self.executable_opts = ['--disable_test', '6', '--num_passes', '1']

        # Tests 6 and 9 are excluded (test 6 is also disabled on the
        # command line above).
        valid_test_ids = {i for i in range(11) if i not in {6, 9}}
        assert_finished_tests = [
            sn.assert_eq(
                sn.count(sn.findall('Test%s finished' % test_id, self.stdout)),
                self.num_tasks_assigned)
            for test_id in valid_test_ids
        ]
        self.sanity_patterns = sn.all([
            *assert_finished_tests,
            sn.assert_not_found('(?i)ERROR', self.stdout),
            sn.assert_not_found('(?i)ERROR', self.stderr)])
Exemple #5
0
    def __init__(self):
        '''Memory-strides benchmark: compile strides.cpp and extract the
        reported bandwidth as the performance metric.
        '''
        super().__init__()
        self.sourcepath = 'strides.cpp'
        self.build_system = 'SingleSource'
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.num_tasks = 1
        self.num_tasks_per_node = 1

        # Each task prints one 'bandwidth' line.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'bandwidth', self.stdout)),
            self.num_tasks_assigned)

        self.perf_patterns = {
            'bandwidth':
            sn.extractsingle(r'bandwidth: (?P<bw>\S+) GB/s', self.stdout, 'bw',
                             float)
        }

        # Logical CPU counts per partition (used elsewhere in the test;
        # not referenced in this constructor).
        self.system_num_cpus = {
            'daint:mc': 72,
            'daint:gpu': 24,
            'dom:mc': 72,
            'dom:gpu': 24,
        }

        self.maintainers = ['SK']
        self.tags = {'benchmark', 'diagnostic'}
    def __init__(self):
        '''Flexible CUDA memtest variant that patches and builds the
        tarball with a Make build system.

        NOTE(review): unlike the sibling variants, this constructor does
        not call ``super().__init__()`` — presumably handled by the
        framework; confirm against the enclosing class.
        '''
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'tiger:gpu']
        self.valid_prog_environs = ['PrgEnv-cray']
        self.descr = 'Flexible CUDA Memtest'
        self.maintainers = ['TM', 'SK']
        self.num_tasks_per_node = 1
        # num_tasks == 0: flexible task count; read back via
        # num_tasks_assigned in the sanity expression below.
        self.num_tasks = 0
        self.num_gpus_per_node = 1
        self.modules = ['cudatoolkit']
        src_url = ('https://downloads.sourceforge.net/project/cudagpumemtest/'
                   'cuda_memtest-1.2.3.tar.gz')
        self.prebuild_cmd = [
            'wget %s' % src_url, 'tar -xzf cuda_memtest-1.2.3.tar.gz',
            'cd cuda_memtest-1.2.3', 'patch -p1 < ../cuda_memtest-1.2.3.patch'
        ]
        self.build_system = 'Make'
        self.executable = './cuda_memtest-1.2.3/cuda_memtest'
        self.executable_opts = ['--disable_test', '6', '--num_passes', '1']

        # Tests 6 and 9 are excluded (test 6 is also disabled on the
        # command line above).
        valid_test_ids = {i for i in range(11) if i not in {6, 9}}
        assert_finished_tests = [
            sn.assert_eq(
                sn.count(sn.findall('Test%s finished' % test_id, self.stdout)),
                self.num_tasks_assigned) for test_id in valid_test_ids
        ]
        self.sanity_patterns = sn.all([
            *assert_finished_tests,
            sn.assert_not_found('(?i)ERROR', self.stdout),
            sn.assert_not_found('(?i)ERROR', self.stderr)
        ])
        self.tags = {'diagnostic', 'ops', 'craype'}
Exemple #7
0
    def __init__(self):
        '''Memory-strides benchmark for the Cannon/FASSE clusters.'''
        self.sourcepath = 'strides.cpp'
        self.build_system = 'SingleSource'
        self.valid_systems = [
            'cannon:local', 'cannon:local-gpu', 'cannon:gpu_test',
            'cannon:test', 'fasse:fasse', 'test:rc-testing'
        ]
        self.valid_prog_environs = ['builtin', 'gnu', 'gpu', 'intel']
        # NOTE(review): '-lpthread' is a linker flag passed via cxxflags;
        # this relies on the compiler driver forwarding it — confirm.
        self.build_system.cxxflags = ['-std=c++11', '-lpthread']
        self.num_tasks = 1
        self.num_tasks_per_node = 1

        # Each task prints one 'bandwidth' line.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'bandwidth', self.stdout)),
            self.num_tasks_assigned)

        self.perf_patterns = {
            'bandwidth':
            sn.extractsingle(r'bandwidth: (?P<bw>\S+) GB/s', self.stdout, 'bw',
                             float)
        }

        # Logical CPU counts per partition; '*' is the fallback entry.
        self.system_num_cpus = {
            'cannon:local': 48,
            'cannon:local-gpu': 32,
            'cannon:gpu_test': 16,
            'cannon:test': 48,
            'fasse:fasse': 48,
            'test:rc-testing': 36,
            '*': 32,
        }
 def __init__(self):
     '''OpenFOAM-Extend interMixingFoam (dambreak tutorial) check.'''
     super().__init__()
     self.descr = ('OpenFOAM-Extend  check of interMixingFoam: '
                   'dambreak tutorial')
     # The dambreak tutorial is expected to emit exactly 2944
     # 'Air phase volume fraction' lines.
     self.sanity_patterns = sn.assert_eq(
         sn.count(sn.findall(r'Air phase volume fraction', self.stdout)),
         2944)
Exemple #9
0
def program_begin_count(obj):
    '''Report the number of ``PROGRAM_BEGIN`` records in the otf2 file
    (trace validation).
    '''
    # Deferred count of lines starting with a PROGRAM_BEGIN record.
    return sn.count(sn.findall(r'^(?P<wl>PROGRAM_BEGIN)\s+', obj.rpt))
Exemple #10
0
    def setup(self, partition, environ, **job_opts):
        '''Configure the per-partition task layout and the per-node
        normalized HPCG performance/sanity patterns, then delegate to the
        parent ``setup``.
        '''
        if partition.fullname in ['daint:gpu', 'dom:gpu']:
            self.num_tasks_per_node = 2
            self.num_cpus_per_task = 12
        else:
            self.num_tasks_per_node = 4
            self.num_cpus_per_task = 18

        # since this is a flexible test, we divide the extracted
        # performance by the number of nodes and compare
        # against a single reference
        # NOTE: true division — the sanity check below asserts the task
        # count is an exact multiple of tasks-per-node.
        num_nodes = self.num_tasks_assigned / self.num_tasks_per_node
        self.perf_patterns = {
            'gflops':
            sn.extractsingle(
                r'HPCG result is VALID with a GFLOP\/s rating of:\s*'
                r'(?P<perf>\S+)', self.outfile_lazy, 'perf', float) / num_nodes
        }

        self.sanity_patterns = sn.all([
            # HPCG prints 'PASSED' once per validation section.
            sn.assert_eq(4, sn.count(sn.findall(r'PASSED',
                                                self.outfile_lazy))),
            sn.assert_eq(0, self.num_tasks_assigned % self.num_tasks_per_node)
        ])

        super().setup(partition, environ, **job_opts)
Exemple #11
0
    def __init__(self, kernel_version):
        '''Shared-memory bandwidth benchmark (shmem.cu).

        BUG FIX: the upper reference threshold was written as
        ``1. - 9520/8850`` (~ -0.076), which is *below* the lower bound
        of -0.01 and therefore can never be satisfied.  The intended
        expression — as in the craype variant of this same test — is
        ``9520 / 8850. - 1`` (~ +0.076), allowing results up to the
        theoretical limit of 9520 GB/s.
        '''
        super().__init__()
        self.sourcepath = 'shmem.cu'
        self.build_system = 'SingleSource'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        # num_tasks == 0: flexible task count.
        self.num_tasks = 0
        self.num_tasks_per_node = 1

        # Two 'Bandwidth' lines are expected per task.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'Bandwidth', self.stdout)),
            self.num_tasks_assigned * 2)

        self.perf_patterns = {
            'bandwidth': sn.extractsingle(
                r'Bandwidth\(double\) (?P<bw>\S+) GB/s',
                self.stdout, 'bw', float)
        }
        # theoretical limit:
        # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
        self.reference = {
            'dom:gpu': {
                'bandwidth': (8850, -0.01, 9520 / 8850. - 1, 'GB/s')
            },
            'daint:gpu': {
                'bandwidth': (8850, -0.01, 9520 / 8850. - 1, 'GB/s')
            },
        }

        self.maintainers = ['SK']
        self.tags = {'benchmark', 'diagnostic'}
Exemple #12
0
 def test_sanity_failure_noassert(self):
     '''A bare ``findall`` sanity pattern (no assert wrapper) must still
     fail the sanity check when the pattern does not match.
     '''
     self.test.sanity_patterns = sn.findall(r'result = success',
                                            self.output_file.name)
     # Write a non-matching line so findall yields no matches.
     self.output_file.write('result = failure\n')
     self.output_file.close()
     with pytest.raises(SanityError):
         self.test.check_sanity()
Exemple #13
0
    def count_successful_burns(self):
        '''Set the sanity patterns to count the number of successful burns.'''

        # Each successful burn prints a line like '[host] GPU 0(OK)';
        # exactly one such line is expected per assigned task.
        return sn.assert_eq(
            sn.count(
                sn.findall(r'^\s*\[[^\]]*\]\s*GPU\s*\d+\(OK\)', self.stdout)),
            self.num_tasks_assigned)
Exemple #14
0
    def setup(self, partition, environ, **job_opts):
        '''Build the sanity expression from the hybrid hello-world output
        ("Hello World from thread T out of NT from process R out of NR")
        and record compilation time as the performance metric.
        '''
        result = sn.findall(
            r'Hello World from thread \s*(\d+) out '
            r'of \s*(\d+) from process \s*(\d+) out of '
            r'\s*(\d+)', self.stdout)

        # Groups: 1 = thread id, 2 = thread count, 3 = rank, 4 = rank count.
        self.sanity_patterns = sn.all(
            sn.chain(
                [
                    sn.assert_eq(sn.count(result),
                                 self.num_tasks * self.num_cpus_per_task)
                ],
                sn.map(
                    lambda x: sn.assert_lt(int(x.group(1)), int(x.group(2))),
                    result),
                sn.map(
                    lambda x: sn.assert_lt(int(x.group(3)), int(x.group(4))),
                    result),
                sn.map(
                    lambda x: sn.assert_lt(int(x.group(1)), self.
                                           num_cpus_per_task), result),
                sn.map(
                    lambda x: sn.assert_eq(int(x.group(2)), self.
                                           num_cpus_per_task), result),
                sn.map(lambda x: sn.assert_lt(int(x.group(3)), self.num_tasks),
                       result),
                sn.map(lambda x: sn.assert_eq(int(x.group(4)), self.num_tasks),
                       result),
            ))

        self.perf_patterns = {
            'compilation_time': sn.getattr(self, 'compilation_time_seconds')
        }
        self.reference = {'*': {'compilation_time': (60, None, 0.1)}}
        super().setup(partition, environ, **job_opts)
Exemple #15
0
       def __init__(self, name, *args, **kwargs):
          '''Multi-repetition wrapper: scale the time limit by
          ``multi_rep`` and derive sanity/perf patterns that expect the
          base test's output ``multi_rep`` times.

          BUG FIX: the original guard used ``name is not ''`` — an
          *identity* comparison against a string literal, which depends on
          CPython string interning and raises a SyntaxWarning on modern
          Pythons.  The intended value comparison is ``name != ''``.
          '''
          if name != '':
             name += '_'
          super().__init__('{0}{1}runs'.format(name, self.multi_rep),
                           *args, **kwargs)

          # Scale the assumed runtime (h, min, s) by multi_rep, carrying
          # overflow from seconds into minutes and minutes into hours.
          self.time_limit = (self.time_limit[0]*self.multi_rep +
                                int((self.time_limit[1]*self.multi_rep)/60),
                             (self.time_limit[1]*self.multi_rep) % 60 +
                                int((self.time_limit[2]*self.multi_rep)/60),
                             (self.time_limit[2]*self.multi_rep) % 60)

          # Check that we got #multi_rep matches of the sanity pattern.
          if hasattr(self, 'multirun_san_pat'):
             self.sanity_patterns = sn.assert_eq(sn.count(
                sn.findall(*self.multirun_san_pat)), self.multi_rep)

          # Create the list of result values: first the average and
          #   then all single elements (to be stored)
          if hasattr(self, 'multirun_perf_pat'):
             self.perf_patterns = {}
             for key in self.multirun_perf_pat:
                self.perf_patterns[key] = sn.avg(
                   sn.extractall(*(self.multirun_perf_pat[key])))
                for run in range(self.multi_rep):
                   self.perf_patterns[key + "_{}".format(run)] = sn.extractall(
                      *(self.multirun_perf_pat[key]))[run]
Exemple #16
0
    def __init__(self):
        '''Shared-memory bandwidth benchmark (shmem.cu), craype variant.'''
        self.sourcepath = 'shmem.cu'
        self.build_system = 'SingleSource'
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'tiger:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        # num_tasks == 0: flexible task count.
        self.num_tasks = 0
        self.num_tasks_per_node = 1
        self.num_gpus_per_node = 1
        if self.current_system.name in {'daint', 'dom', 'tiger'}:
            self.modules = ['craype-accel-nvidia60']

        # Two 'Bandwidth' lines are expected per task.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'Bandwidth', self.stdout)),
            self.num_tasks_assigned * 2)

        self.perf_patterns = {
            'bandwidth':
            sn.extractsingle(r'Bandwidth\(double\) (?P<bw>\S+) GB/s',
                             self.stdout, 'bw', float)
        }
        self.reference = {
            # theoretical limit for P100:
            # 8 [B/cycle] * 1.328 [GHz] * 16 [bankwidth] * 56 [SM] = 9520 GB/s
            'dom:gpu': {
                'bandwidth': (8850, -0.01, 9520 / 8850. - 1, 'GB/s')
            },
            'daint:gpu': {
                'bandwidth': (8850, -0.01, 9520 / 8850. - 1, 'GB/s')
            }
        }

        self.maintainers = ['SK']
        self.tags = {'benchmark', 'diagnostic', 'craype'}
Exemple #17
0
    def test_sanity_multiple_patterns(self):
        '''Sanity checking passes when the required match count is present
        and raises ``SanityError`` when more matches are demanded.
        '''
        self.output_file.write('result1 = success\n')
        self.output_file.write('result2 = success\n')
        self.output_file.close()

        # Simulate a pure sanity test; invalidate the reference values
        self.test.reference = {}
        # Exactly two 'resultN = success' lines were written above.
        self.test.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'result\d = success', self.output_file.name)),
            2)
        self.test.check_sanity()

        # Require more patterns to be present
        # Only two matches exist, so requiring three must fail.
        self.test.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'result\d = success', self.output_file.name)),
            3)
        self.assertRaises(SanityError, self.test.check_sanity)
Exemple #18
0
    def __init__(self, variant, lang, linkage):
        '''Hello-world compile-and-run check for the ubelix system.

        Validates thread/rank ids against the announced counts and
        records compilation time as the performance metric.
        '''
        self.linkage = linkage
        self.variables = {'CRAYPE_LINK_TYPE': linkage}
        self.prgenv_flags = {}
        self.lang_names = {'c': 'C', 'cpp': 'C++', 'f90': 'Fortran 90'}
        self.descr = self.lang_names[lang] + ' Hello World'
        self.sourcepath = 'hello_world'
        self.build_system = 'SingleSource'
        self.valid_systems = ['ubelix:compute', 'ubelix:gpu']

        self.valid_prog_environs = ['foss', 'intel']

        # Filled in elsewhere (e.g. after compilation); used below as a
        # deferred perf value.
        self.compilation_time_seconds = None

        result = sn.findall(
            r'Hello World from thread \s*(\d+) out '
            r'of \s*(\d+) from process \s*(\d+) out of '
            r'\s*(\d+)', self.stdout)

        # Deferred attribute reads so the values are taken at evaluation
        # time, not at construction time.
        num_tasks = sn.getattr(self, 'num_tasks')
        num_cpus_per_task = sn.getattr(self, 'num_cpus_per_task')

        def tid(match):
            return int(match.group(1))

        def num_threads(match):
            return int(match.group(2))

        def rank(match):
            return int(match.group(3))

        def num_ranks(match):
            return int(match.group(4))

        self.sanity_patterns = sn.all(
            sn.chain(
                [
                    sn.assert_eq(sn.count(result),
                                 num_tasks * num_cpus_per_task)
                ],
                sn.map(lambda x: sn.assert_lt(tid(x), num_threads(x)), result),
                sn.map(lambda x: sn.assert_lt(rank(x), num_ranks(x)), result),
                sn.map(lambda x: sn.assert_lt(tid(x), num_cpus_per_task),
                       result),
                sn.map(
                    lambda x: sn.assert_eq(num_threads(x), num_cpus_per_task),
                    result),
                sn.map(lambda x: sn.assert_lt(rank(x), num_tasks), result),
                sn.map(lambda x: sn.assert_eq(num_ranks(x), num_tasks),
                       result),
            ))
        self.perf_patterns = {
            'compilation_time': sn.getattr(self, 'compilation_time_seconds')
        }
        self.reference = {'*': {'compilation_time': (60, None, 0.1, 's')}}

        self.maintainers = ['VH', 'EK']
        self.tags = {'production', 'prgenv'}
Exemple #19
0
 def __init__(self):
     '''Node health check: run ``hostname`` on every node and verify one
     ``nidNNN`` line per task.
     '''
     self.valid_systems = ['daint:gpu', 'daint:mc']
     self.valid_prog_environs = ['cray']
     self.executable = 'hostname'
     # num_tasks == 0: flexible task count; the deferred getattr below
     # reads the final value at sanity-evaluation time.
     self.num_tasks = 0
     self.num_tasks_per_node = 1
     self.sanity_patterns = sn.assert_eq(
         sn.getattr(self, 'num_tasks'),
         sn.count(sn.findall(r'^nid\d+$', self.stdout)))
Exemple #20
0
    def __init__(self, **kwargs):
        '''OpenFOAM-Extend interMixingFoam (dambreak tutorial) check.

        BUG FIX: the description string read 'OpenFOA-Extend'; corrected
        to 'OpenFOAM-Extend', matching the sibling variant of this check.
        '''
        super().__init__(
            'interMixingFoam',
            'OpenFOAM-Extend  check of interMixingFoam: dambreak tutorial',
            **kwargs)

        # The dambreak tutorial is expected to emit exactly 2944
        # 'Air phase volume fraction' lines.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'Air phase volume fraction', self.stdout)),
            2944)
Exemple #21
0
 def __init__(self):
     '''OpenFOAM interMixingFoam (dambreak tutorial) check.'''
     super().__init__()
     self.descr = 'OpenFOAM check of interMixingFoam: dambreak tutorial'
     self.sanity_patterns = sn.all([
         # 2534 'Air phase volume fraction' lines are expected.
         sn.assert_eq(
             sn.count(sn.findall('(?P<line>Air phase volume fraction)',
                                 self.stdout)), 2534),
         # The solver must reach its final 'End' line.
         sn.assert_found(r'^\s*[Ee]nd', self.stdout)
     ])
Exemple #22
0
    def __init__(self, exec_mode):
        '''FFTW benchmark check.

        :param exec_mode: ``'nompi'`` for the single-node run, anything
            else for the MPI run (different task counts, options and
            references).
        '''
        self.sourcepath = 'fftw_benchmark.c'
        self.build_system = 'SingleSource'
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn', 'tiger:gpu']
        self.modules = ['cray-fftw']
        self.num_tasks_per_node = 12
        self.num_gpus_per_node = 0
        # Exactly one 'execution time' line is expected.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'execution time', self.stdout)), 1)
        self.build_system.cflags = ['-O2']
        # kesch needs explicit FFTW include/link flags; the Cray systems
        # get them from the cray-fftw module.
        if self.current_system.name == 'kesch':
            self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-pgi']
            self.build_system.cflags += [
                '-I$FFTW_INC', '-L$FFTW_DIR', '-lfftw3'
            ]
        elif self.current_system.name in {'daint', 'dom', 'tiger'}:
            self.valid_prog_environs = [
                'PrgEnv-cray', 'PrgEnv-pgi', 'PrgEnv-gnu'
            ]

        self.perf_patterns = {
            'fftw_exec_time':
            sn.extractsingle(r'execution time:\s+(?P<exec_time>\S+)',
                             self.stdout, 'exec_time', float),
        }

        if exec_mode == 'nompi':
            self.num_tasks = 12
            self.executable_opts = ['72 12 1000 0']
            self.reference = {
                'dom:gpu': {
                    'fftw_exec_time': (0.55, None, 0.05, 's'),
                },
                'daint:gpu': {
                    'fftw_exec_time': (0.55, None, 0.05, 's'),
                },
                'kesch:cn': {
                    'fftw_exec_time': (0.61, None, 0.05, 's'),
                }
            }
        else:
            self.num_tasks = 72
            self.executable_opts = ['144 72 200 1']
            self.reference = {
                'dom:gpu': {
                    'fftw_exec_time': (0.47, None, 0.50, 's'),
                },
                'daint:gpu': {
                    'fftw_exec_time': (0.47, None, 0.50, 's'),
                },
                'kesch:cn': {
                    'fftw_exec_time': (1.58, None, 0.50, 's'),
                }
            }

        self.maintainers = ['AJ']
        self.tags = {'benchmark', 'scs', 'craype'}
Exemple #23
0
    def __init__(self):
        '''GPU burn stress test: every GPU must report OK and the minimum
        per-GPU GFLOP/s is compared against a per-system reference.
        '''
        super().__init__()
        self.valid_systems = ['daint:gpu', 'dom:gpu', 'kesch:cn']
        self.descr = 'GPU burn test'
        self.valid_prog_environs = ['PrgEnv-gnu']

        if self.current_system.name == 'kesch':
            self.exclusive_access = True
            self.modules = ['craype-accel-nvidia35']
            # NOTE: The first option indicates the precision (-d for double)
            #       while the seconds is the time (in secs) to run the test.
            #       For multi-gpu nodes, we run the gpu burn test for more
            #       time to get reliable measurements.
            self.executable_opts = ['-d', '40']
            self.num_gpus_per_node = 16
            gpu_arch = '37'
        else:
            self.modules = ['craype-accel-nvidia60']
            self.executable_opts = ['-d', '20']
            self.num_gpus_per_node = 1
            gpu_arch = '60'

        self.sourcepath = 'gpu_burn.cu'
        self.build_system = 'SingleSource'
        self.build_system.cxxflags = [
            '-arch=compute_%s' % gpu_arch,
            '-code=sm_%s' % gpu_arch
        ]
        self.build_system.ldflags = ['-lcuda', '-lcublas', '-lnvidia-ml']

        # One 'OK' line per assigned task.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall('OK', self.stdout)), self.num_tasks_assigned)

        # Performance metric: the *slowest* GPU's GFLOP/s.
        self.perf_patterns = {
            'perf':
            sn.min(
                sn.extractall(r'GPU\s+\d+\(\S*\): (?P<perf>\S*) GF\/s',
                              self.stdout, 'perf', float))
        }

        self.reference = {
            'dom:gpu': {
                'perf': (4115, -0.10, None)
            },
            'daint:gpu': {
                'perf': (4115, -0.10, None)
            },
            'kesch:cn': {
                'perf': (950, -0.10, None)
            }
        }

        # num_tasks == 0: flexible task count, one task per node.
        self.num_tasks = 0
        self.num_tasks_per_node = 1

        self.maintainers = ['AJ', 'VK', 'TM']
        self.tags = {'diagnostic', 'benchmark'}
Exemple #24
0
    def test_findall(self):
        '''``sn.findall`` returns all matches of a pattern in a file,
        with working ``group()`` access by index and by name.

        FIX: regex patterns containing ``\\d`` are now raw strings; in a
        plain string literal ``'\\d'`` is an invalid escape sequence
        (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12+).
        '''
        res = evaluate(sn.findall(r'Step: \d+', self.tempfile))
        self.assertEqual(3, len(res))

        res = evaluate(sn.findall('Step:.*', self.tempfile))
        self.assertEqual(3, len(res))

        res = evaluate(sn.findall('Step: [12]', self.tempfile))
        self.assertEqual(2, len(res))

        # Check the matches
        for expected, match in zip(['Step: 1', 'Step: 2'], res):
            self.assertEqual(expected, match.group(0))

        # Check groups
        res = evaluate(sn.findall(r'Step: (?P<no>\d+)', self.tempfile))
        for step, match in enumerate(res, start=1):
            self.assertEqual(step, int(match.group(1)))
            self.assertEqual(step, int(match.group('no')))
Exemple #25
0
 def validate_passed(self):
     '''Deferred HPCG validation: the factorization message must be
     absent, 'PASSED' must appear four times and the assigned task count
     must be an exact multiple of tasks-per-node.
     '''
     return sn.all([
         sn.assert_not_found(
             r'invalid because the ratio',
             self.outfile_lazy,
             msg='number of processes assigned could not be factorized'),
         # HPCG prints 'PASSED' once per validation section.
         sn.assert_eq(4, sn.count(sn.findall(r'PASSED',
                                             self.outfile_lazy))),
         sn.assert_eq(0, self.num_tasks_assigned % self.num_tasks_per_node)
     ])
Exemple #26
0
def test_findall(tempfile):
    '''``sn.findall`` returns all matches of a pattern in a file, with
    working ``group()`` access by index and by name.
    '''
    res = sn.evaluate(sn.findall(r'Step: \d+', tempfile))
    assert 3 == len(res)

    res = sn.evaluate(sn.findall('Step:.*', tempfile))
    assert 3 == len(res)

    res = sn.evaluate(sn.findall('Step: [12]', tempfile))
    assert 2 == len(res)

    # Check the matches
    for expected, match in zip(['Step: 1', 'Step: 2'], res):
        assert expected == match.group(0)

    # Check groups
    res = sn.evaluate(sn.findall(r'Step: (?P<no>\d+)', tempfile))
    for step, match in enumerate(res, start=1):
        assert step == int(match.group(1))
        assert step == int(match.group('no'))
Exemple #27
0
 def __init__(self):
     '''Threaded hello-world check: 16 threads must each print a
     'Hello, World!' line.
     '''
     self.valid_systems = ['*']
     self.valid_prog_environs = ['*']
     self.sourcepath = 'hello_threads.cpp'
     # Argument: number of threads to spawn.
     self.executable_opts = ['16']
     self.build_system = 'SingleSource'
     self.build_system.cxxflags = ['-std=c++11', '-Wall']
     num_messages = sn.len(
         sn.findall(r'\[\s?\d+\] Hello, World\!', self.stdout))
     self.sanity_patterns = sn.assert_eq(num_messages, 16)
Exemple #28
0
    def __init__(self):
        '''Distributed TensorFlow training via ipyparallel: sanity checks
        that the two engines run on different nodes; perf patterns track
        fit quality, cluster start-up retries and start-up time.
        '''
        self.descr = 'Distributed training with TensorFlow using ipyparallel'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        cray_cdt_version = osext.cray_cdt_version()
        # FIXME: The following will not be needed after the Daint upgrade
        if self.current_system.name == 'dom':
            self.modules = [
                'ipcmagic',
                f'Horovod/0.21.0-CrayGNU-{cray_cdt_version}-tf-2.4.0'
            ]
        else:
            self.modules = [
                'ipcmagic', 'Horovod/0.19.1-CrayGNU-20.08-tf-2.2.0'
            ]

        self.num_tasks = 2
        self.num_tasks_per_node = 1
        self.executable = 'ipython'
        self.executable_opts = ['tf-hvd-sgd-ipc-tf2.py']
        # Extract the node ids; the two tasks must land on distinct nodes.
        nids = sn.extractall(r'nid(?P<nid>\d+)', self.stdout, 'nid', str)
        self.sanity_patterns = sn.all(
            [sn.assert_ne(nids, []),
             sn.assert_ne(nids[0], nids[1])])
        self.reference = {
            'daint:gpu': {
                'slope': (2.0, -0.1, 0.1, None),
                'offset': (0.0, -0.1, 0.1, None),
                'retries': (0, None, None, None),
                'time': (10, None, None, 's'),
            },
            'dom:gpu': {
                'slope': (2.0, -0.1, 0.1, None),
                'offset': (0.0, -0.1, 0.1, None),
                'retries': (0, None, None, None),
                'time': (10, None, None, 's'),
            }
        }
        self.perf_patterns = {
            'slope':
            sn.extractsingle(r'slope=(?P<slope>\S+)', self.stdout, 'slope',
                             float),
            'offset':
            sn.extractsingle(r'offset=(?P<offset>\S+)', self.stdout, 'offset',
                             float),
            # Each retry re-prints 'IPCluster is already running'; with no
            # retries the count is 4 (presumably the baseline occurrences
            # of the message — confirm against the script's output).
            'retries':
            4 -
            sn.count(sn.findall(r'IPCluster is already running', self.stdout)),
            'time':
            sn.extractsingle(
                r'IPCluster is ready\!\s+'
                r'\((?P<time>\d+) seconds\)', self.stdout, 'time', float)
        }
        self.maintainers = ['RS', 'TR']
        self.tags = {'production'}
Exemple #29
0
    def __init__(self, variant):
        '''Distributed TensorFlow/Horovod training benchmark.

        :param variant: ``'small'`` runs 8 tasks (also on dom), anything
            else runs 32 tasks on daint only.
        '''
        self.descr = 'Distributed training with TensorFlow and Horovod'
        self.valid_systems = ['daint:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        tfshortver = '1.14'
        # Sources are cloned from the upstream benchmarks repository.
        self.sourcesdir = 'https://github.com/tensorflow/benchmarks'
        self.modules = ['Horovod/0.16.4-CrayGNU-19.06-tf-%s.0' % tfshortver]
        if variant == 'small':
            self.valid_systems += ['dom:gpu']
            self.num_tasks = 8
            self.reference = {
                'dom:gpu': {
                    'throughput': (1133.6, None, 0.05, 'images/s'),
                },
                'daint:gpu': {
                    'throughput': (1134.8, None, 0.05, 'images/s')
                },
            }
        else:
            self.num_tasks = 32
            self.reference = {
                'daint:gpu': {
                    'throughput': (4403.0, None, 0.05, 'images/s')
                },
            }

        self.num_tasks_per_node = 1
        self.num_cpus_per_task = 12
        # Average the per-step throughput lines.
        self.perf_patterns = {
            'throughput':
            sn.avg(
                sn.extractall(r'total images/sec:\s+(?P<throughput>\S+)',
                              self.stdout, 'throughput', float))
        }

        # One throughput line per task is expected.
        self.sanity_patterns = sn.assert_eq(
            sn.count(sn.findall(r'total images/sec:', self.stdout)),
            self.num_tasks)

        # Pin the benchmark sources to the branch matching the TF version.
        self.pre_run = ['git checkout cnn_tf_v%s_compatible' % tfshortver]
        self.variables = {
            'NCCL_DEBUG': 'INFO',
            'NCCL_IB_HCA': 'ipogif0',
            'NCCL_IB_CUDA_SUPPORT': '1',
            'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
        }
        self.executable = 'python'
        self.executable_opts = [
            'scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py',
            '--model inception3', '--batch_size 64',
            '--variable_update horovod', '--log_dir ./logs',
            '--train_dir ./checkpoints'
        ]
        self.tags = {'production'}
        self.maintainers = ['MS', 'RS']
Exemple #30
0
 def __init__(self):
     '''Environment-variable propagation check: each of the two tasks
     echoes ``$MY_VAR`` and the value must appear once per task.
     '''
     super().__init__()
     self.num_tasks = 2
     self.valid_systems = [
         'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', 'kesch:cn',
         'kesch:pn'
     ]
     self.executable = '/bin/echo'
     # The variable is expanded by the shell at run time.
     self.executable_opts = ['$MY_VAR']
     self.variables = {'MY_VAR': 'TEST123456!'}
     num_matches = sn.count(sn.findall(r'TEST123456!', self.stdout))
     self.sanity_patterns = sn.assert_eq(self.num_tasks, num_matches)