def download(self):
    """Fetch every URL in ``self.src`` into ``self.builddir``.

    The local file name is the last '/'-separated component of the URL.
    A URL whose target file already exists in the build directory is
    skipped.
    """
    for link in self.src:
        target = os.path.join(self.builddir, link.split('/')[-1])
        if os.path.exists(target):
            continue

        # wget is invoked with certificate verification switched off
        syscmd(f'wget --no-check-certificate {link} -O {target}')
def run(self, redirect=0):
    """Execute ``self.runcmd``, parse the result, then pause for 5 seconds.

    The location of the output (``self.outdir`` joined with
    ``self.output``) is logged before the command is started.

    Args:
        redirect: when truthy, ``self.output`` is handed to ``syscmd`` as
            its second argument (redirect output to file); otherwise the
            command is run without it.
    """
    output_path = os.path.join(self.outdir, self.output)
    logging.info(f'{"Output":7} : {output_path}')

    if not redirect:
        syscmd(self.runcmd)
    else:
        # redirect output to file
        syscmd(self.runcmd, self.output)

    self.parse()

    time.sleep(5)
def check_prerequisite(self, module, min_ver):
    """Abort unless ``module`` is available in at least version ``min_ver``.

    The probe command and the version regex are looked up in
    ``prerequisite.cmd`` and ``prerequisite.regex``. The command is run
    against the first entry of ``self.nodelist``, and the first capture
    group of the regex is taken as the installed version.

    Args:
        module: key into ``prerequisite.cmd`` and ``prerequisite.regex``.
        min_ver: minimum acceptable version string.

    Raises:
        SystemExit: with a non-zero status when the version cannot be
            determined from the command output or is older than
            ``min_ver``.
    """
    # insert hostname after ssh
    cmd = prerequisite.cmd[module].replace('ssh', f'{ssh_cmd} {self.nodelist[0]}')
    regex = prerequisite.regex[module]

    found = re.search(regex, syscmd(cmd))
    if found is None:
        # No version in the output (e.g. the module is missing): report it
        # instead of failing with AttributeError on ``None.group``.
        logging.error(
            f'{module} >= {min_ver} is required by {self.name} '
            f'(unable to detect installed version)'
        )
        sys.exit(1)

    version = found.group(1)
    if packaging.version.parse(version) < packaging.version.parse(min_ver):
        logging.error(f'{module} >= {min_ver} is required by {self.name}')
        # exit with a failure status so calling scripts can see the error
        sys.exit(1)
def nvidia_smi(node):
    """List the GPUs reported by ``nvidia-smi -L`` on ``node``.

    Args:
        node: host on which ``nvidia-smi -L`` is run via ``ssh_cmd``.

    Returns:
        dict mapping the GPU index (as a string) to ``[name, uuid]``.
        Lines that do not have the form ``GPU <n>: <name> (UUID: <uuid>)``
        are ignored.
    """
    listing = syscmd(f'{ssh_cmd} {node} "nvidia-smi -L"')

    # raw string: \d and \( are regex escapes, not Python string escapes
    gpu_line = re.compile(r'^GPU (\d+): (.+?) \(UUID: (.+?)\)')

    device = {}
    for line in listing.splitlines():
        found = gpu_line.search(line)
        if found is None:
            # skip anything that is not a GPU entry instead of crashing
            continue

        index, name, gpu_uuid = found.groups()
        device[index] = [name, gpu_uuid]

    return device
def gpu_affinity(node):
    """Collect one affinity value per GPU row of ``nvidia-smi topo -m``.

    For every output line that starts with ``GPU<n>``, the last
    whitespace-separated column is taken (presumably the NUMA affinity
    column -- confirm against the nvidia-smi version in use). A value that
    is not purely digits is replaced by ``'0'``.

    Args:
        node: host on which ``nvidia-smi topo -m`` is run via ``ssh_cmd``.

    Returns:
        list of strings, one entry per GPU row, in output order.
    """
    topology = syscmd(f'{ssh_cmd} {node} "nvidia-smi topo -m"')

    affinity = []
    for line in topology.splitlines():
        # raw strings keep \d a regex escape rather than a string escape
        if not re.search(r'^GPU\d+', line):
            continue

        numa = line.split()[-1]
        affinity.append(numa if re.search(r'^\d+$', numa) else '0')

    return affinity
def device_query(node, builddir='./'):
    """Build CUDA's deviceQuery sample and run it on ``node``.

    The three source files needed by the sample are downloaded into
    ``builddir`` (files already present are not fetched again), compiled
    with ``nvcc`` on the local host, and the resulting binary is executed
    on ``node`` after loading the modules returned by ``get_module()``.
    The downloaded sources and the binary are left in ``builddir``.

    Args:
        node: host on which the compiled ``deviceQuery`` is executed.
        builddir: directory used for the download, the build and the run.

    Returns:
        tuple ``(runtime, cuda_cc)``: the last field of the
        "/ Runtime Version" line, and the last field of the first
        "Minor version number" line with the dots removed.

    Raises:
        RuntimeError: if either value is missing from the output.
    """
    # requirement to build deviceQuery
    sample_url = [
        'https://raw.githubusercontent.com/NVIDIA/cuda-samples/master/Common/helper_cuda.h',
        'https://raw.githubusercontent.com/NVIDIA/cuda-samples/master/Common/helper_string.h',
        'https://raw.githubusercontent.com/NVIDIA/cuda-samples/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp',
    ]

    # download cuda samples
    for url in sample_url:
        file_path = os.path.join(builddir, url.split('/')[-1])
        if not os.path.exists(file_path):
            syscmd(f'wget {url} -O {file_path}')

    # build deviceQuery on host
    syscmd(f'builtin cd {builddir}; nvcc -I. deviceQuery.cpp -o deviceQuery')

    # execute deviceQuery on remote host
    query = syscmd(
        f'{ssh_cmd} {node} '
        f'"cd {builddir}; module load {" ".join(get_module())}; ./deviceQuery"'
    )

    runtime = None
    cuda_cc = None
    for line in query.splitlines():
        if '/ Runtime Version' in line:
            runtime = line.split()[-1]
        if 'Minor version number' in line:
            cuda_cc = line.split()[-1].replace('.', '')
            # only the first reported device is considered
            break

    if runtime is None or cuda_cc is None:
        # Previously this surfaced as an UnboundLocalError at the return.
        raise RuntimeError(
            f'unable to parse deviceQuery output from {node}: '
            f'runtime={runtime!r}, cuda_cc={cuda_cc!r}'
        )

    return runtime, cuda_cc
def lscpu(node):
    """Summarise the ``lscpu`` output of ``node``.

    Each line is split at its first ':' into a key and a value, and the
    key is matched after stripping surrounding whitespace. Matching on the
    key (rather than searching the whole line) keeps indented output
    working and avoids picking up similarly named entries such as
    "BIOS Model name".

    Args:
        node: host on which ``lscpu`` is run via ``ssh_cmd``.

    Returns:
        dict with the entries found in the output:
            'CPUs'    -- int, last field of the "CPU(s)" line
            'Model'   -- str, value of the "Model name" line
            'Threads' -- str, last field of the "Thread(s) ..." line
            'AVXs'    -- str, upper-cased ``avx*`` flags joined by ', '
            'NUMA'    -- list of str, last field of each "NUMA node<n>"
                         line (always present, possibly empty)
    """
    host = {}
    numa = []

    report = syscmd(f'{ssh_cmd} {node} lscpu')

    for line in report.splitlines():
        key, _, value = line.partition(':')
        key = key.strip()
        fields = value.split()

        if key == 'CPU(s)':
            host['CPUs'] = int(fields[-1])
        elif key == 'Model name':
            host['Model'] = ' '.join(fields)
        elif key.startswith('Thread(s)'):
            host['Threads'] = fields[-1]
        elif re.match(r'NUMA node\d+', key):
            # a node may be listed without any CPUs; record it as ''
            numa.append(fields[-1] if fields else '')
        elif key == 'Flags':
            # \b instead of a trailing \s+ so that a flag at the very end
            # of the line is not dropped
            avx = re.findall(r'\bavx\w*', value)
            host['AVXs'] = ', '.join(flag.upper() for flag in avx)

    host['NUMA'] = numa

    return host
def gpu_memory(node):
    """Return the total memory of GPU 0 on ``node`` as an integer.

    The value is the first whitespace-separated token printed by
    ``nvidia-smi`` for the ``memory.total`` query (presumably MiB, as
    reported by nvidia-smi -- confirm).
    """
    query = f'{ssh_cmd} {node} "nvidia-smi -i 0 --query-gpu=memory.total --format=csv,noheader"'
    fields = syscmd(query).split()

    return int(fields[0])
def build(self):
    """Run every command of ``self.buildcmd`` through ``syscmd``, in order."""
    for step in self.buildcmd:
        syscmd(step)
def cpu_memory(host):
    """Return the ``MemTotal`` value of ``host`` as an integer.

    Args:
        host: host on which ``/proc/meminfo`` is read via ``ssh_cmd``.

    Returns:
        int: the second whitespace-separated field of the ``MemTotal``
        line (the kernel reports this value in kB).
    """
    meminfo = syscmd(f'{ssh_cmd} {host} grep MemTotal /proc/meminfo')

    # "MemTotal: <value> kB" -> the value is the second field
    return int(meminfo.split()[1])