def write_server_run_script():
    """Write an executable SLURM run script (``run.sh``) for the bridges cluster.

    Builds the ``#SBATCH`` header from the hard-coded resource
    configuration below, appends the environment setup and launch
    commands, writes the result to ``run.sh`` inside ``local_folderpath``
    and marks it executable.
    """
    assert servertype == 'bridges'
    # NOTE: to edit according to the configuration needed.
    jobname = jobtype
    time_budget_in_hours = 48  # max 48 hours
    mem_budget_in_gb = 16
    partition_name = 'GPU-shared'
    num_cpus = 1  # probably ask a CPU for each GPU (or more if you have data loaders)
    num_gpus = 1  # up to 4 if k80, up to 2 if p100
    gpu_type = 'k80'  # in ['k80', 'p100']

    # budgets expressed in the units sbatch expects (MB and minutes).
    mem_in_mbs = tb_rs.convert_between_byte_units(
        mem_budget_in_gb, src_units='gigabytes', dst_units='megabytes')
    time_in_minutes = tb_lg.convert_between_time_units(
        time_budget_in_hours, src_units='hours', dst_units='minutes')

    script_header = [
        '#!/bin/bash',
        '#SBATCH --nodes=1',
        '#SBATCH --partition=%s' % partition_name,
        '#SBATCH --cpus-per-task=%d' % num_cpus,
        '#SBATCH --gres=gpu:%s:%d' % (gpu_type, num_gpus),
        '#SBATCH --mem=%dM' % mem_in_mbs,
        '#SBATCH --time=%d' % time_in_minutes,
        '#SBATCH --job-name=%s' % jobname,
    ]
    # NOTE: changes to the environment can be put in the run script.
    script_body = [
        'module load tensorflow/1.5_gpu',
        'PYTHONPATH=%s:$PYTHONPATH' % remote_folderpath,
        'python -u %s > log_%s.txt' % (main_relfilepath, jobname)
    ]

    script_filepath = tb_fs.join_paths([local_folderpath, "run.sh"])
    tb_io.write_textfile(script_filepath, script_header + [''] + script_body)
    subprocess.check_output(['chmod', '+x', script_filepath])
def memory_since_last(self, units='megabytes'):
    """Return the change in memory usage since the previous call, in ``units``.

    Side effect: resets the reference point (``self.last_registered``,
    kept in bytes), so successive calls report incremental deltas.
    """
    current = self.memory_total('bytes')
    delta = current - self.last_registered
    self.last_registered = current
    return tb_rs.convert_between_byte_units(delta, dst_units=units)
def get_gpu_information():
    """Query ``nvidia-smi`` for per-GPU utilization and memory statistics.

    Returns:
        A list with one dict per GPU, with keys ``gpu_id``,
        ``gpu_utilization_in_percent``,
        ``gpu_memory_utilization_in_gigabytes`` and
        ``gpu_total_memory_in_gigabytes``. Returns an empty list when
        ``nvidia-smi`` is missing or exits with an error.
    """
    gpus = []

    def to_gigabytes(x):
        # nvidia-smi reports memory in MiB (printed as "NNN MiB").
        return tb_rs.convert_between_byte_units(
            x, src_units='megabytes', dst_units='gigabytes')

    try:
        out = subprocess.check_output([
            'nvidia-smi',
            '--query-gpu=utilization.gpu,memory.used,memory.total',
            '--format=csv,noheader'
        ])
        # check_output returns bytes on Python 3; normalize to text.
        if not isinstance(out, str):
            out = out.decode('utf-8')
        # Skip blank lines: the original unpacked '' into three fields
        # and crashed when the output was empty.
        lines = [s for s in out.strip().split('\n') if s.strip()]
        for i, s in enumerate(lines):
            utilization_s, memory_s, total_memory_s = s.split(', ')
            gpus.append({
                'gpu_id': i,
                'gpu_utilization_in_percent': float(utilization_s.split()[0]),
                'gpu_memory_utilization_in_gigabytes':
                    to_gigabytes(float(memory_s.split()[0])),
                'gpu_total_memory_in_gigabytes':
                    to_gigabytes(float(total_memory_s.split()[0]))
            })
    except (OSError, subprocess.CalledProcessError):
        # nvidia-smi not installed (OSError) or it failed, e.g. no
        # driver loaded (CalledProcessError, which the original did not
        # catch): report no GPUs rather than crash.
        pass
    return gpus
def run_on_matrix(bash_command,
                  servername,
                  username,
                  password=None,
                  num_cpus=1,
                  num_gpus=0,
                  mem_budget=8.0,
                  time_budget=60.0,
                  mem_units='gigabytes',
                  time_units='minutes',
                  folderpath=None,
                  wait_for_output=True,
                  require_gpu_type=None,
                  run_on_head_node=False,
                  jobname=None):
    """Run a bash command on the matrix cluster through SLURM.

    Writes ``bash_command`` into a uniquely named temporary script on
    the server, submits it with ``srun`` (blocking) or ``sbatch``
    (non-blocking) — or runs it directly on the head node — and removes
    the script afterwards.

    Returns whatever ``run_on_server`` returns for the composed command.
    """
    # GPUs are only reachable through the scheduler, not the head node.
    assert (not run_on_head_node) or num_gpus == 0
    assert require_gpu_type is None  ### NOT IMPLEMENTED YET.

    # prompts for password if it has not been provided
    # (fix: identity comparison with None, not ==).
    if password is None:
        password = getpass.getpass()

    script_cmd = "\n".join(['#!/bin/bash', bash_command])
    # unique name avoids collisions between concurrent submissions.
    script_name = "run_%s.sh" % uuid.uuid4()

    # either do the call using sbatch, or run directly on the head node.
    if not run_on_head_node:
        cmd_parts = [
            'srun' if wait_for_output else 'sbatch',
            '--cpus-per-task=%d' % num_cpus,
            '--gres=gpu:%d' % num_gpus,
            '--mem=%d' % tb_rs.convert_between_byte_units(
                mem_budget, src_units=mem_units, dst_units='megabytes'),
            '--time=%d' % tb_lg.convert_between_time_units(
                time_budget, src_units=time_units, dst_units='minutes')
        ]
        if jobname is not None:
            cmd_parts += ['--job-name=%s' % jobname]
        cmd_parts += [script_name]
        run_script_cmd = ' '.join(cmd_parts)
    else:
        run_script_cmd = './' + script_name

    # actual command to run remotely.
    # NOTE(review): script_cmd is embedded in single quotes, so
    # bash_command must not itself contain single quotes.
    remote_cmd = " && ".join([
        "echo \'%s\' > %s" % (script_cmd, script_name),
        "chmod +x %s" % script_name,
        run_script_cmd,
        "rm %s" % script_name
    ])
    return run_on_server(
        remote_cmd,
        **tb_ut.subset_dict_via_selection(locals(), [
            'servername', 'username', 'password', 'folderpath',
            'wait_for_output'
        ]))
def run(self, run_only_if_enough_resources_for_all=True):
    """Greedily place and launch the registered jobs on lithium nodes.

    Each job is assigned to the first node that satisfies its node/GPU
    type restrictions and has enough free cpus, gpus and memory.

    Returns:
        The list of outputs from ``run_on_lithium_node`` for the jobs
        launched, or None if ``run_only_if_enough_resources_for_all``
        is True and some job could not be placed (in which case nothing
        is launched). Jobs that were not launched remain in ``self.jobs``.
    """
    args = tb_ut.subset_dict_via_selection(
        vars(self), ['servername', 'username', 'password'])
    args['abort_if_any_node_unavailable'] = False

    # get the resource availability and filter out unavailable nodes.
    d = get_lithium_resource_availability(**args)
    d = {k: v for (k, v) in d.iteritems() if v is not None}

    g = get_lithium_nodes()

    # assignments to each of the registered jobs (one entry per job;
    # None means the job could not be placed).
    run_cfgs = []
    for x in self.jobs:
        if x['require_nodes'] is not None:
            req_nodes = x['require_nodes']
        else:
            req_nodes = d.keys()

        # based on the gpu type restriction.
        if x['require_gpu_types'] is not None:
            req_gpu_nodes = tb_ut.flatten(
                tb_ut.subset_dict_via_selection(g, x['require_gpu_types']))
        else:
            # NOTE: only consider the nodes that are available anyway.
            req_gpu_nodes = d.keys()

        # potentially available nodes to place this job.
        nodes = list(set(req_nodes).intersection(req_gpu_nodes))
        assert len(nodes) > 0

        # greedy assignment to a node.
        assigned = False
        for n in nodes:
            r = d[n]
            mem_needed_mbs = tb_rs.convert_between_byte_units(
                x['mem_budget'], src_units=x['mem_units'],
                dst_units='megabytes')
            # if there are enough resources on the node, assign it to
            # the job.
            if ((r['cpus_free'] >= x['num_cpus']) and
                    (r['gpus_free'] >= x['num_gpus']) and
                    (r['mem_mbs_free'] >= mem_needed_mbs)):
                # record information about where to run the job.
                run_cfgs.append({
                    'node': n,
                    'visible_gpu_ids': r['free_gpu_ids'][:x['num_gpus']]
                })
                # deduct the allocated resources from the available
                # resources for that node.
                r['cpus_free'] -= x['num_cpus']
                r['gpus_free'] -= x['num_gpus']
                r['mem_mbs_free'] -= mem_needed_mbs
                r['free_gpu_ids'] = r['free_gpu_ids'][x['num_gpus']:]
                # BUG FIX: this assignment was commented out, so every
                # successfully placed job ALSO appended None below,
                # misaligning zip(self.jobs, run_cfgs) and aborting the
                # run even when resources sufficed.
                assigned = True
                break

        # if not assigned, either record the failure or abort entirely.
        if not assigned:
            run_cfgs.append(None)
            if run_only_if_enough_resources_for_all:
                print("Insufficient resources to satisfy"
                      " (cpus=%d, gpus=%d, mem=%0.3f%s)" %
                      (x['num_cpus'], x['num_gpus'], x['mem_budget'],
                       x['mem_units']))
                return None

    # running the jobs that have a valid config.
    remaining_jobs = []
    outs = []
    for x, c in zip(self.jobs, run_cfgs):
        if c is None:
            remaining_jobs.append(x)
        else:
            out = run_on_lithium_node(**tb_ut.merge_dicts([
                tb_ut.subset_dict_via_selection(
                    vars(self), ['servername', 'username', 'password']),
                tb_ut.subset_dict_via_selection(x, [
                    'bash_command', 'folderpath', 'wait_for_output',
                    'run_on_head_node'
                ]),
                tb_ut.subset_dict_via_selection(c, ['node', 'visible_gpu_ids'])
            ]))
            outs.append(out)
    self.jobs = remaining_jobs
    return outs
def memory_process(pid, units='megabytes'):
    """Return the resident memory of process ``pid``, converted to ``units``."""
    rss_in_bytes = psutil.Process(pid).memory_info()[0]
    return tb_rs.convert_between_byte_units(rss_in_bytes, dst_units=units)
def memory_max(self, units='megabytes'):
    """Return the peak memory registered so far, converted to ``units``."""
    return tb_rs.convert_between_byte_units(
        self.max_registered, dst_units=units)
def memory_total(self, units='megabytes'):
    """Return the current memory usage of this process, in ``units``.

    Side effect: updates the registered peak (``self.max_registered``).
    The peak is kept in bytes, matching how ``memory_max`` and
    ``memory_since_last`` convert it.
    """
    # BUG FIX: query in bytes. The original queried in `units`, stored
    # that value into self.max_registered (which memory_max converts as
    # bytes) and converted it again as bytes on return — a double
    # conversion for any units other than 'bytes'.
    mem_now = memory_process(os.getpid(), 'bytes')
    if self.max_registered < mem_now:
        self.max_registered = mem_now
    return tb_rs.convert_between_byte_units(mem_now, dst_units=units)