def remote(cls, host, force=False):
    """Fetch the openrc file from a remote cloud host and merge its
    exported variables into the credentials of cloudmesh.yaml.

    TODO: there is a bug in the installation of kilo: the openrc file
    on the remote machine is not called openrc.sh but contains the
    username and project number.

    :param host: the name of the remote host (cloud) in cloudmesh.yaml
    :param force: currently unused
    :return: the credentials dict for the host, or None on error
    """
    config = ConfigDict("cloudmesh.yaml")
    host_spec = config["cloudmesh.clouds." + host]

    if 'cm_openrc' not in host_spec:
        Console.error("no cm_openrc specified in the host")
        return
    Console.ok("looking for openrc")

    hostname = config["cloudmesh.clouds." + host + ".cm_host"]
    Console.ok("fetching information from {:} ...".format(host))

    openrc = host_spec["cm_openrc"]
    directory = os.path.dirname(openrc)

    # remote source: every file in the directory that holds the openrc
    _from_dir = "{:}:{:}".format(hostname, directory + "/*").replace("~/", "")

    # FIX: issues with path expanding on Windows; Config.path_expand
    # produced paths that scp cannot handle, so expand manually.
    _to_dir = os.path.realpath(os.path.expanduser(directory))

    # On Windows, scp fails with a path such as C:\Users\... but works
    # with '~/.cloudmesh/...'.  On Linux it is the opposite: scp fails
    # with ~/.cloudmesh/... and works with /home/user/...
    # Hence the OS check below for the scp destination directory.
    os_type = platform.system().lower()
    if 'windows' not in os_type:
        directory = _to_dir

    # FIX: scp on Windows does not understand paths in the format
    # "C:/Users/<>", it rather expects "~/.cloudmesh/<>"
    openrc_file = os.path.realpath(os.path.expanduser(openrc))

    print("From: ", _from_dir)
    print("To: ", _to_dir)
    print("Openrc:", openrc_file)

    cls.make_dir(_to_dir)

    Console.ok("Reading rc file from {}".format(host))
    try:
        Shell.scp('-r', _from_dir, directory)
    except Exception as e:
        print(e)
        return

    # TODO: the permissions are not yet right
    os.chmod(_to_dir, 0o700)
    for root, dirs, _ in os.walk(_to_dir):
        for d in dirs:
            os.chmod(os.path.join(root, d), 0o700)

    with open(openrc_file, 'r') as f:
        lines = f.read().split("\n")

    # merge every "export KEY=VALUE" line into the credentials
    config = ConfigDict("cloudmesh.yaml")
    for line in lines:
        if line.strip().startswith("export"):
            # strip only the leading keyword, not every occurrence
            line = line.replace("export ", "", 1)
            key, value = line.split("=", 1)
            config["cloudmesh"]["clouds"][host]["credentials"][key] = value

    host_spec = config["cloudmesh"]["clouds"][host]
    credentials = host_spec["credentials"]
    if "cm_openrc" in host_spec:
        openrc = host_spec["cm_openrc"]
        for attribute in credentials:
            if attribute in openrc:
                # BUG FIX: str.replace returns a new string; the original
                # code discarded the result, making this loop a no-op.
                openrc = openrc.replace(attribute, credentials[attribute])

    config.save()

    # re-read so we return exactly what was persisted
    config = ConfigDict("cloudmesh.yaml")
    return config["cloudmesh"]["clouds"][host]["credentials"]
def run(cls, cluster, group, cmd, **kwargs):
    """Submit *cmd* as a SLURM batch job on *cluster*.

    Builds an sbatch script from the cluster configuration and the
    option overrides in *kwargs*, copies it (and, if *cmd* is a local
    file, that script too) into the remote experiment directory,
    submits it with ``sbatch``, and records the job in the database.

    :param cluster: name of the cluster in cloudmesh.yaml
    :param group: experiment group name stored with the job
    :param cmd: the command to run, or the path of a local script file
    :param kwargs: sbatch option overrides (-t, -N, -p, -o, -D, -e, ...)
    :return: the job data dict, or an error message string if the
             username is not configured
    """
    # TODO: the script count is variable in the database; we should
    # test whether the file exists and, if it does, increase the
    # counter until we find one that does not -- that becomes the new
    # counter, which is then placed in the db.
    # define get_script_name(directory, prefix, counter); there may be
    # a similar thing already in the old cloudmesh.
    config = cls.read_config(cluster)
    if config["credentials"]["username"] == 'TBD':
        return "Please enter username in cloudmesh.yaml for cluster {}".format(cluster)

    cls.incr()
    data = {
        "cluster": cluster,
        "count": cls.counter(),
        "username": config["credentials"]["username"],
        "remote_experiment_dir": config["default"]["experiment_dir"],
        "queue": config["default"]["queue"],
        "id": None,
        "nodes": 1,
        "tasks_per_node": 1,
    }
    data["script_base_name"] = "{username}-{count}".format(**data)
    data["script_name"] = "{username}-{count}.sh".format(**data)
    data["script_output"] = "{username}-{count}.out".format(**data)
    data["script_error"] = "{username}-{count}.err".format(**data)
    # the second .format resolves placeholders such as {username} that
    # may themselves be embedded in the configured experiment_dir
    data["remote_experiment_dir"] = \
        "{remote_experiment_dir}/{count}".format(**data).format(**data)
    data["group"] = group

    # defaults for the sbatch options; each may be overwritten by the
    # corresponding kwargs entry below
    option_mapping = {'-t': '{tasks_per_node}'.format(**data),
                      '-N': '{nodes}'.format(**data),
                      '-p': '{queue}'.format(**data),
                      '-o': '{script_output}'.format(**data),
                      '-D': '{remote_experiment_dir}'.format(**data),
                      '-e': '{script_error}'.format(**data)}
    for (k, v) in iteritems(option_mapping):
        option_mapping[k] = kwargs.get(k) or v

    config = cls.read_config(cluster)
    try:
        project = config["credentials"]["project"]
        if project.lower() not in ["tbd", "none"]:
            option_mapping["-A"] = project
    except (KeyError, AttributeError):
        # no (usable) project configured; submit without an account (-A)
        pass

    data.update(option_mapping)

    # render the #SBATCH header lines for the script
    options = ""
    for key, value in option_mapping.items():
        options += '#SBATCH {} {}\n'.format(key, value)

    cls.create_remote_dir(cluster, data["remote_experiment_dir"])

    # if the command is a local script, copy it to the remote
    # experiment directory and run it from there
    if os.path.isfile(Config.path_expand(cmd)):
        _from = Config.path_expand(cmd)
        _to = '{cluster}:{remote_experiment_dir}'.format(**data)
        local_file_name = cmd.split('/')[-1]
        Shell.execute("rsync", [_from, _to])
        data["command"] = '{remote_experiment_dir}/{local_file_name}'.format(
            local_file_name=local_file_name, **data)
    else:
        data["command"] = cmd
    data["options"] = options

    script = textwrap.dedent(
        """
        #! /bin/sh
        {options}
        echo '#CLOUDMESH: BATCH ENVIRONMENT'
        echo 'BASIL_RESERVATION_ID:' $BASIL_RESERVATION_ID
        echo 'SLURM_CPU_BIND:' $SLURM_CPU_BIND
        echo 'SLURM_JOB_ID:' $SLURM_JOB_ID
        echo 'SLURM_JOB_CPUS_PER_NODE:' $SLURM_JOB_CPUS_PER_NODE
        echo 'SLURM_JOB_DEPENDENCY:' $SLURM_JOB_DEPENDENCY
        echo 'SLURM_JOB_NAME:' $SLURM_JOB_NAME
        echo 'SLURM_JOB_NODELIST:' $SLURM_JOB_NODELIST
        echo 'SLURM_JOB_NUM_NODES:' $SLURM_JOB_NUM_NODES
        echo 'SLURM_MEM_BIND:' $SLURM_MEM_BIND
        echo 'SLURM_TASKS_PER_NODE:' $SLURM_TASKS_PER_NODE
        echo 'MPIRUN_NOALLOCATE:' $MPIRUN_NOALLOCATE
        echo 'MPIRUN_NOFREE:' $MPIRUN_NOFREE
        echo 'SLURM_NTASKS_PER_CORE:' $SLURM_NTASKS_PER_CORE
        echo 'SLURM_NTASKS_PER_NODE:' $SLURM_NTASKS_PER_NODE
        echo 'SLURM_NTASKS_PER_SOCKET:' $SLURM_NTASKS_PER_SOCKET
        echo 'SLURM_RESTART_COUNT:' $SLURM_RESTART_COUNT
        echo 'SLURM_SUBMIT_DIR:' $SLURM_SUBMIT_DIR
        echo 'MPIRUN_PARTITION:' $MPIRUN_PARTITION
        d=$(date)
        echo \"#CLOUDMESH: status, start, $d\"
        srun -l echo \"#CLOUDMESH: status, start, $d\"
        srun -l {command}
        d=$(date)
        srun -l echo \"#CLOUDMESH: status, finished, $d\"
        d=$(date)
        echo \"#CLOUDMESH: status, finished, $d\"
        """
    ).format(**data).replace("\r\n", "\n").strip()

    _from = Config.path_expand('~/.cloudmesh/{script_name}'.format(**data))
    _to = '{cluster}:{remote_experiment_dir}'.format(**data)
    data["from"] = _from
    data["to"] = _to
    data["script"] = script

    # write the script locally, then copy it to the remote host
    with open(_from, 'w') as local_file:
        local_file.write(script)
    Shell.scp(_from, _to)

    # run the sbatch command on the cluster
    cmd = 'sbatch {remote_experiment_dir}/{script_name}'.format(**data)
    data["cmd"] = cmd
    result = Shell.ssh(cluster, cmd)
    data["output"] = result

    # parse the job id out of the sbatch output
    # ("Submitted batch job <id>")
    for line in result.split("\n"):
        if "Submitted batch job" in line:
            data["job_id"] = int(line.replace("Submitted batch job ", "").strip())
            break

    # TODO: should not depend on Model.py (BATCHJOB) directly; it also
    # has user/username fields which seem wrong.
    # TODO: we may also want to store the '#CLOUDMESH: BATCH
    # ENVIRONMENT' section of the .out file (SLURM_JOB_ID,
    # SLURM_JOB_NODELIST, ...) since those names reflect the env vars.

    # remove the single-letter sbatch option keys before storing; the
    # database fields need full names
    for key in ['-t', '-N', '-p', '-o', '-D', '-e']:
        if key in data:
            print(key, data[key])
            del data[key]

    data['status'] = 'started'
    cls.add_db(**data)
    return data
def remote(cls, host, force=False):
    """Fetch the openrc file from a remote cloud host and merge its
    exported variables into the credentials of cloudmesh.yaml.

    NOTE(review): this definition appears verbatim elsewhere in the
    file -- confirm whether one copy can be removed.

    TODO: there is a bug in the installation of kilo: the openrc file
    on the remote machine is not called openrc.sh but contains the
    username and project number.

    :param host: the name of the remote host (cloud) in cloudmesh.yaml
    :param force: currently unused
    :return: the credentials dict for the host, or None on error
    """
    config = ConfigDict("cloudmesh.yaml")
    host_spec = config["cloudmesh.clouds." + host]

    if 'cm_openrc' not in host_spec:
        Console.error("no cm_openrc specified in the host")
        return
    Console.ok("looking for openrc")

    hostname = config["cloudmesh.clouds." + host + ".cm_host"]
    Console.ok("fetching information from {:} ...".format(host))

    openrc = host_spec["cm_openrc"]
    directory = os.path.dirname(openrc)

    # remote source: every file in the directory that holds the openrc
    _from_dir = "{:}:{:}".format(hostname, directory + "/*").replace("~/", "")

    # FIX: issues with path expanding on Windows; Config.path_expand
    # produced paths that scp cannot handle, so expand manually.
    _to_dir = os.path.realpath(os.path.expanduser(directory))

    # On Windows, scp fails with a path such as C:\Users\... but works
    # with '~/.cloudmesh/...'.  On Linux it is the opposite: scp fails
    # with ~/.cloudmesh/... and works with /home/user/...
    # Hence the OS check below for the scp destination directory.
    os_type = platform.system().lower()
    if 'windows' not in os_type:
        directory = _to_dir

    # FIX: scp on Windows does not understand paths in the format
    # "C:/Users/<>", it rather expects "~/.cloudmesh/<>"
    openrc_file = os.path.realpath(os.path.expanduser(openrc))

    print("From: ", _from_dir)
    print("To: ", _to_dir)
    print("Openrc:", openrc_file)

    cls.make_dir(_to_dir)

    Console.ok("Reading rc file from {}".format(host))
    try:
        Shell.scp('-r', _from_dir, directory)
    except Exception as e:
        print(e)
        return

    # TODO: the permissions are not yet right
    os.chmod(_to_dir, 0o700)
    for root, dirs, _ in os.walk(_to_dir):
        for d in dirs:
            os.chmod(os.path.join(root, d), 0o700)

    with open(openrc_file, 'r') as f:
        lines = f.read().split("\n")

    # merge every "export KEY=VALUE" line into the credentials
    config = ConfigDict("cloudmesh.yaml")
    for line in lines:
        if line.strip().startswith("export"):
            # strip only the leading keyword, not every occurrence
            line = line.replace("export ", "", 1)
            key, value = line.split("=", 1)
            config["cloudmesh"]["clouds"][host]["credentials"][key] = value

    host_spec = config["cloudmesh"]["clouds"][host]
    credentials = host_spec["credentials"]
    if "cm_openrc" in host_spec:
        openrc = host_spec["cm_openrc"]
        for attribute in credentials:
            if attribute in openrc:
                # BUG FIX: str.replace returns a new string; the original
                # code discarded the result, making this loop a no-op.
                openrc = openrc.replace(attribute, credentials[attribute])

    config.save()

    # re-read so we return exactly what was persisted
    config = ConfigDict("cloudmesh.yaml")
    return config["cloudmesh"]["clouds"][host]["credentials"]