Example #1
def write_server_run_script():
    assert servertype == 'bridges'
    # NOTE: edit these values according to the desired configuration.
    jobname = jobtype
    time_budget_in_hours = 48  # max 48 hours
    mem_budget_in_gb = 16
    partition_name = 'GPU-shared'
    num_cpus = 1  # request at least one CPU per GPU (more if you use data loaders)
    num_gpus = 1  # up to 4 if k80, up to 2 if p100
    gpu_type = 'k80'  # in ['k80', 'p100']

    script_header = [
        '#!/bin/bash',
        '#SBATCH --nodes=1',
        '#SBATCH --partition=%s' % partition_name,
        '#SBATCH --cpus-per-task=%d' % num_cpus,
        '#SBATCH --gres=gpu:%s:%d' % (gpu_type, num_gpus),
        '#SBATCH --mem=%dM' % tb_rs.convert_between_byte_units(
            mem_budget_in_gb, src_units='gigabytes', dst_units='megabytes'),
        '#SBATCH --time=%d' % tb_lg.convert_between_time_units(
            time_budget_in_hours, src_units='hours', dst_units='minutes'),
        '#SBATCH --job-name=%s' % jobname,
    ]
    # NOTE: changes to the environment can be put in the run script.
    script_body = [
        'module load tensorflow/1.5_gpu',
        'PYTHONPATH=%s:$PYTHONPATH' % remote_folderpath,
        'python -u %s > log_%s.txt' % (main_relfilepath, jobname)
    ]

    script_filepath = tb_fs.join_paths([local_folderpath, "run.sh"])
    tb_io.write_textfile(script_filepath, script_header + [''] + script_body)
    subprocess.check_output(['chmod', '+x', script_filepath])
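
The tb_rs and tb_lg helpers used above are not shown on this page. For orientation, here is a minimal sketch of what they presumably do; the real implementations may differ, e.g. in the unit names accepted, the defaults, or in using 1000 rather than 1024 per step.

def convert_between_byte_units(x, src_units='bytes', dst_units='megabytes'):
    # Assumed behavior: shift by powers of 1024 between adjacent units.
    units = ['bytes', 'kilobytes', 'megabytes', 'gigabytes', 'terabytes']
    assert src_units in units and dst_units in units
    return x * (1024.0 ** (units.index(src_units) - units.index(dst_units)))

def convert_between_time_units(x, src_units='seconds', dst_units='hours'):
    # Assumed behavior: scale through seconds as the common base unit.
    units = {'seconds': 1.0, 'minutes': 60.0, 'hours': 3600.0, 'days': 86400.0}
    assert src_units in units and dst_units in units
    return x * units[src_units] / units[dst_units]

With the defaults above, these would render --mem=16384 (megabytes) and --time=2880 (minutes) in the SBATCH header.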
Example #2
    def memory_since_last(self, units='megabytes'):
        mem_now = self.memory_total('bytes')

        mem_dif = mem_now - self.last_registered
        self.last_registered = mem_now

        return tb_rs.convert_between_byte_units(mem_dif, dst_units=units)
Example #3
def get_gpu_information():
    gpus = []
    try:
        convert_to_gigabytes = lambda x: tb_rs.convert_between_byte_units(
            x, src_units='megabytes', dst_units='gigabytes')
        out = subprocess.check_output([
            'nvidia-smi',
            '--query-gpu=utilization.gpu,memory.used,memory.total',
            '--format=csv,noheader'
        ]).decode('utf-8')  # check_output returns bytes on Python 3

        gpu_s_lst = out.strip().split('\n')
        for i, s in enumerate(gpu_s_lst):
            utilization_s, memory_s, total_memory_s = s.split(', ')
            gpus.append({
                'gpu_id': i,
                'gpu_utilization_in_percent': float(utilization_s.split()[0]),
                'gpu_memory_utilization_in_gigabytes':
                    convert_to_gigabytes(float(memory_s.split()[0])),
                'gpu_total_memory_in_gigabytes':
                    convert_to_gigabytes(float(total_memory_s.split()[0]))
            })
    except OSError:
        pass
    return gpus
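
A quick way to exercise this function; it returns an empty list when nvidia-smi is not on the PATH:

for gpu in get_gpu_information():
    print('GPU %d: %.1f%% utilization, %.2f of %.2f GB in use' %
          (gpu['gpu_id'], gpu['gpu_utilization_in_percent'],
           gpu['gpu_memory_utilization_in_gigabytes'],
           gpu['gpu_total_memory_in_gigabytes']))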
Example #4
def run_on_matrix(bash_command,
                  servername,
                  username,
                  password=None,
                  num_cpus=1,
                  num_gpus=0,
                  mem_budget=8.0,
                  time_budget=60.0,
                  mem_units='gigabytes',
                  time_units='minutes',
                  folderpath=None,
                  wait_for_output=True,
                  require_gpu_type=None,
                  run_on_head_node=False,
                  jobname=None):

    assert (not run_on_head_node) or num_gpus == 0
    assert require_gpu_type is None  ### NOT IMPLEMENTED YET.

    # prompt for the password if it has not been provided.
    if password is None:
        password = getpass.getpass()

    script_cmd = "\n".join(['#!/bin/bash', bash_command])
    script_name = "run_%s.sh" % uuid.uuid4()

    # either submit through srun/sbatch, or run directly on the head node.
    if not run_on_head_node:
        cmd_parts = [
            'srun' if wait_for_output else 'sbatch',
            '--cpus-per-task=%d' % num_cpus,
            '--gres=gpu:%d' % num_gpus,
            '--mem=%d' % tb_rs.convert_between_byte_units(
                mem_budget, src_units=mem_units, dst_units='megabytes'),
            '--time=%d' % tb_lg.convert_between_time_units(
                time_budget, time_units, dst_units='minutes')
        ]
        if jobname is not None:
            cmd_parts += ['--job-name=%s' % jobname]
        cmd_parts += [script_name]

        run_script_cmd = ' '.join(cmd_parts)
    else:
        run_script_cmd = './' + script_name

    # actual command to run remotely
    remote_cmd = " && ".join([
        "echo \'%s\' > %s" % (script_cmd, script_name),
        "chmod +x %s" % script_name, run_script_cmd,
        "rm %s" % script_name
    ])

    return run_on_server(
        remote_cmd,
        **tb_ut.subset_dict_via_selection(locals(), [
            'servername', 'username', 'password', 'folderpath',
            'wait_for_output'
        ]))
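
A hypothetical invocation, for illustration only; the server name and credentials are placeholders, and run_on_server (defined elsewhere in the toolbox, not shown here) performs the actual remote execution:

out = run_on_matrix('python -u train.py',
                    servername='matrix.example.edu',  # placeholder
                    username='alice',                 # placeholder
                    num_cpus=2,
                    num_gpus=1,
                    mem_budget=16.0,
                    time_budget=4.0,
                    time_units='hours',
                    jobname='train_run')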
Example #5
    def run(self, run_only_if_enough_resources_for_all=True):
        args = tb_ut.subset_dict_via_selection(
            vars(self), ['servername', 'username', 'password'])
        args['abort_if_any_node_unavailable'] = False

        # get the resource availability and filter out unavailable nodes.
        d = get_lithium_resource_availability(**args)
        d = {k: v for (k, v) in d.items() if v is not None}

        g = get_lithium_nodes()

        # assignments to each of the registered jobs
        run_cfgs = []
        for x in self.jobs:
            if x['require_nodes'] is not None:
                req_nodes = x['require_nodes']
            else:
                req_nodes = d.keys()

            # based on the gpu type restriction.
            if x['require_gpu_types'] is not None:
                req_gpu_nodes = tb_ut.flatten(
                    tb_ut.subset_dict_via_selection(g, x['require_gpu_types']))
            else:
                # NOTE: only consider the nodes that are available anyway.
                req_gpu_nodes = d.keys()

            # potentially available nodes to place this job.
            nodes = list(set(req_nodes).intersection(req_gpu_nodes))
            assert len(nodes) > 0

            # greedily assign the job to the first node with enough resources.
            assigned = False
            for n in nodes:
                r = d[n]
                # if there are enough resources on the node, assign it to the
                # job.
                if ((r['cpus_free'] >= x['num_cpus'])
                        and (r['gpus_free'] >= x['num_gpus']) and
                    (r['mem_mbs_free'] >= tb_rs.convert_between_byte_units(
                        x['mem_budget'],
                        src_units=x['mem_units'],
                        dst_units='megabytes'))):

                    # record information about where to run the job.
                    run_cfgs.append({
                        'node': n,
                        'visible_gpu_ids': r['free_gpu_ids'][:x['num_gpus']]
                    })

                    # deduct the allocated resources from the available resources
                    # for that node.
                    r['cpus_free'] -= x['num_cpus']
                    r['gpus_free'] -= x['num_gpus']
                    r['mem_mbs_free'] -= tb_rs.convert_between_byte_units(
                        x['mem_budget'],
                        src_units=x['mem_units'],
                        dst_units='megabytes')
                    r['free_gpu_ids'] = r['free_gpu_ids'][x['num_gpus']:]
                    assigned = True
                    break

            # if the job could not be placed, record that; optionally abort.
            if not assigned:
                run_cfgs.append(None)
                if run_only_if_enough_resources_for_all:
                    print("Insufficient resources to satisfy"
                          " (cpus=%d, gpus=%d, mem=%0.3f%s)" %
                          (x['num_cpus'], x['num_gpus'], x['mem_budget'],
                           x['mem_units']))
                    return None

        # running the jobs that have a valid config.
        remaining_jobs = []
        outs = []
        for x, c in zip(self.jobs, run_cfgs):
            if c is None:
                remaining_jobs.append(x)
            else:
                out = run_on_lithium_node(**tb_ut.merge_dicts([
                    tb_ut.subset_dict_via_selection(vars(
                        self), ['servername', 'username', 'password']),
                    tb_ut.subset_dict_via_selection(x, [
                        'bash_command', 'folderpath', 'wait_for_output',
                        'run_on_head_node'
                    ]),
                    tb_ut.subset_dict_via_selection(
                        c, ['node', 'visible_gpu_ids'])
                ]))
                outs.append(out)

        self.jobs = remaining_jobs
        return outs
Example #6
def memory_process(pid, units='megabytes'):
    psutil_p = psutil.Process(pid)
    mem_p = psutil_p.memory_info().rss  # resident set size, in bytes
    return tb_rs.convert_between_byte_units(mem_p, dst_units=units)
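
For example, to print the resident memory of the current process (os is assumed to be imported alongside psutil):

import os

print('current process: %.1f MB' % memory_process(os.getpid(), 'megabytes'))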
Example #7
    def memory_max(self, units='megabytes'):
        return tb_rs.convert_between_byte_units(self.max_registered,
                                                dst_units=units)
Example #8
    def memory_total(self, units='megabytes'):
        # work in bytes internally; memory_process already converts to the
        # requested units, so passing `units` here would double-convert and
        # leave max_registered unit-inconsistent with memory_max above.
        mem_now = memory_process(os.getpid(), 'bytes')
        if self.max_registered < mem_now:
            self.max_registered = mem_now

        return tb_rs.convert_between_byte_units(mem_now, dst_units=units)
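
Examples #2, #7, and #8 are methods of the same memory-tracking object. A minimal sketch of how they fit together, assuming a class (named MemoryTracker here; the name and __init__ are guesses, not part of the original code) that starts both counters at the current usage in bytes:

class MemoryTracker:
    def __init__(self):
        self.last_registered = memory_process(os.getpid(), 'bytes')
        self.max_registered = self.last_registered

    # memory_total, memory_max, and memory_since_last as defined above.

tracker = MemoryTracker()
xs = list(range(10 ** 6))  # allocate something measurable
print('delta since last check: %.1f MB' % tracker.memory_since_last())
print('peak so far: %.1f MB' % tracker.memory_max())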