예제 #1
0
    def remote(cls, host, force=False):
        """

        TODO: there is a bug in the instalation of kilo the openrc file on
        the remote machine is not called openrc.sh but contains username and
        project number.

        :param host: the remote host
        :param force:
        :return:
        """

        config = ConfigDict("cloudmesh.yaml")

        host_spec = config["cloudmesh.clouds." + host]
        host_credentials = host_spec["credentials"]

        if 'cm_openrc' in host_spec:
            Console.ok("looking for openrc")
        else:
            Console.error("no cm_openrc specified in the host")
            return

        hostname = config["cloudmesh.clouds." + host + ".cm_host"]
        Console.ok("fetching information from {:}  ...".format(host))

        openrc = host_spec["cm_openrc"]

        directory = os.path.dirname(openrc)
        base = os.path.basename(openrc)

        _from_dir = "{:}:{:}".format(hostname, directory + "/*").replace("~/", "")
        # _to_dir = os.path.dirname(Config.path_expand(directory))
        # FIX: Issues with path expanding on Windows
        _to_dir = os.path.realpath(
            os.path.expanduser(directory)
        )

        '''
        In Windows, SCP fails with path such as C:\\Users\\...,
            and passes with '~/.cloudmesh/...'
        But on Linux machines, it fails with ~/.cloudmesh/...
            and passes with /home/user/...
        Hence, adding OS check below for SCP copy directory
        '''
        os_type = platform.system().lower()
        if 'windows' not in os_type:
            directory = _to_dir

        # FIX: fix for scp not working on Windows, because scp does not
        # understand
        # paths in format: "C:/Users/<>", rather expects "~/.cloudmesh/<>"
        # openrc_file = Config.path_expand(openrc)
        openrc_file = os.path.realpath(
            os.path.expanduser(openrc)
        )
        print("From:  ", _from_dir)
        print("To:    ", _to_dir)
        print("Openrc:", openrc_file)

        cls.make_dir(_to_dir)
        r = ""
        Console.ok("Reading rc file from {}".format(host))
        try:
            r = Shell.scp('-r', _from_dir, directory)
        except Exception as e:
            print(e)
            return

        #
        # TODO: the permission are not yet right
        #
        os.chmod(_to_dir, 0o700)
        for root, dirs, _ in os.walk(_to_dir):
            for d in dirs:
                os.chmod(os.path.join(root, d), 0o700)
        #
        # END PERMISSION
        #

        with open(openrc_file, 'r') as f:
            lines = f.read().split("\n")

        config = ConfigDict("cloudmesh.yaml")
        for line in lines:
            if line.strip().startswith("export"):
                line = line.replace("export ", "")
                key, value = line.split("=", 1)
                config["cloudmesh"]["clouds"][host]["credentials"][key] = value
        host_spec = config["cloudmesh"]["clouds"][host]
        credentials = host_spec["credentials"]

        if "cm_openrc" in host_spec:
            openrc = host_spec["cm_openrc"]
            for attribute in credentials:
                if attribute in openrc:
                    openrc.replace(attribute, credentials[attribute])
        config.save()
        config = ConfigDict("cloudmesh.yaml")
        return config["cloudmesh"]["clouds"][host]["credentials"]
예제 #2
0
    def run(cls, cluster, group, cmd, **kwargs):

        # determine the script name..

        #
        # TODO: script count is variable in data base, we test if fil exists and if it
        # does increase counter till we find one that does not, that will be new counter.
        # new counter will than be placed in db.
        #
        # define get_script_name(directory, prefix, counter)
        # there maybe s a similar thing already in the old cloudmesh
        #

        # if not kwargs['-name']:
        #
        #    old_count = Shell.ssh(cluster,
        #                          "ls {}*.sh | wc -l | sed 's/$/ count/'".
        #                          format(username))
        #    c = [f for f in old_count.splitlines() if 'count' in f]
        #    script_count = c[0].split()[0]
        # else:
        #    script_count = kwargs['-name']

        config = cls.read_config(cluster)
        if config["credentials"]["username"] == 'TBD':
            return "Please enter username in cloudmesh.yaml for cluster {}".format(cluster)

        cls.incr()
        data = {
            "cluster": cluster,
            "count": cls.counter(),
            "username": config["credentials"]["username"],
            "remote_experiment_dir": config["default"]["experiment_dir"],
            "queue": config["default"]["queue"],
            "id": None,
            "nodes": 1,
            "tasks_per_node": 1,
        }
        data["script_base_name"] = "{username}-{count}".format(**data)
        data["script_name"] = "{username}-{count}.sh".format(**data)
        data["script_output"] = "{username}-{count}.out".format(**data)
        data["script_error"] = "{username}-{count}.err".format(**data)
        data["remote_experiment_dir"] = \
            "{remote_experiment_dir}/{count}".format(**data).format(**data)
        data["group"] = group

        # overwrite defaults
        option_mapping = {'-t': '{tasks_per_node}'.format(**data),
                          '-N': '{nodes}'.format(**data),
                          '-p': '{queue}'.format(**data),
                          '-o': '{script_output}'.format(**data),
                          '-D': '{remote_experiment_dir}'.format(**data),
                          '-e': '{script_error}'.format(**data)}

        # map(lambda k, v:
        #    option_mapping.__setitem__(k, kwargs.get(k) or v),
        #    option_mapping.items())
        #
        # rewrite for better readability
        for (k, v) in iteritems(option_mapping):
            option_mapping[k] = kwargs.get(k) or v

        config = cls.read_config(cluster)
        project = None
        try:
            project = config["credentials"]["project"]
            if project.lower() not in ["tbd", "none"]:
                option_mapping["-A"] = project
        except:
            pass

        for key in option_mapping:
            data[key] = option_mapping[key]

        # create the options for the script
        options = ""
        for key, value in option_mapping.items():
            options += '#SBATCH {} {}\n'.format(key, value)

        cls.create_remote_dir(cluster, data["remote_experiment_dir"])

        # if the command is a script, copy the script
        if os.path.isfile(Config.path_expand(cmd)):
            _from = Config.path_expand(cmd)
            _to = '{cluster}:{remote_experiment_dir}'.format(**data)

            local_file_name = cmd.split('/')[-1]
            Shell.execute("rsync", [_from, _to])
            data["command"] = '{remote_experiment_dir}/{local_file_name}'.format(local_file_name=local_file_name,
                                                                                 **data)
        else:
            data["command"] = cmd

        data["options"] = options

        script = textwrap.dedent(
            """
            #! /bin/sh
            {options}

            echo '#CLOUDMESH: BATCH ENVIRONMENT'
            echo 'BASIL_RESERVATION_ID:' $BASIL_RESERVATION_ID
            echo 'SLURM_CPU_BIND:' $SLURM_CPU_BIND
            echo 'SLURM_JOB_ID:' $SLURM_JOB_ID
            echo 'SLURM_JOB_CPUS_PER_NODE:' $SLURM_JOB_CPUS_PER_NODE
            echo 'SLURM_JOB_DEPENDENCY:' $SLURM_JOB_DEPENDENCY
            echo 'SLURM_JOB_NAME:' $SLURM_JOB_NAME
            echo 'SLURM_JOB_NODELIST:' $SLURM_JOB_NODELIST
            echo 'SLURM_JOB_NUM_NODES:' $SLURM_JOB_NUM_NODES
            echo 'SLURM_MEM_BIND:' $SLURM_MEM_BIND
            echo 'SLURM_TASKS_PER_NODE:' $SLURM_TASKS_PER_NODE
            echo 'MPIRUN_NOALLOCATE:' $MPIRUN_NOALLOCATE
            echo 'MPIRUN_NOFREE:' $MPIRUN_NOFREE
            echo 'SLURM_NTASKS_PER_CORE:' $SLURM_NTASKS_PER_CORE
            echo 'SLURM_NTASKS_PER_NODE:' $SLURM_NTASKS_PER_NODE
            echo 'SLURM_NTASKS_PER_SOCKET:' $SLURM_NTASKS_PER_SOCKET
            echo 'SLURM_RESTART_COUNT:' $SLURM_RESTART_COUNT
            echo 'SLURM_SUBMIT_DIR:' $SLURM_SUBMIT_DIR
            echo 'MPIRUN_PARTITION:' $MPIRUN_PARTITION
            d=$(date)
            echo \"#CLOUDMESH: status, start, $d\"
            srun -l echo \"#CLOUDMESH: status, start, $d\"
            srun -l {command}
            d=$(date)
            srun -l echo \"#CLOUDMESH: status, finished, $d\"
            d=$(date)
            echo \"#CLOUDMESH: status, finished, $d\"
            """
        ).format(**data).replace("\r\n", "\n").strip()

        _from = Config.path_expand('~/.cloudmesh/{script_name}'.format(**data))
        _to = '{cluster}:{remote_experiment_dir}'.format(**data)
        data["from"] = _from
        data["to"] = _to
        data["script"] = script
        # write the script to local

        # print(_from)
        # print(_to)

        with open(_from, 'w') as local_file:
            local_file.write(script)

        # copy to remote host
        Shell.scp(_from, _to)

        # delete local file
        # Shell.execute('rm', _from)
        # import sys; sys.exit()

        # run the sbatch command

        cmd = 'sbatch {remote_experiment_dir}/{script_name}'.format(**data)
        data["cmd"] = cmd

        # print ("CMD>", cmd)
        result = Shell.ssh(cluster, cmd)

        data["output"] = result

        # find id
        for line in result.split("\n"):
            # print ("LLL>", line)
            if "Submitted batch job" in line:
                data["job_id"] = int(line.replace("Submitted batch job ", "").strip())
                break

        #
        # HACK, should not depend on Model.py
        #

        # from cloudmesh_client.db.model import BATCHJOB
        # name = ""
        # BATCHJOB(name,
        #         cluster=data["cluster"],
        #         id=data["id"],
        #         script=data["script"]) # has user and username which seems wrong

        # here what we have in data and want to store the - options are obviously wrong
        # and need to be full names
        # noinspection PyPep8,PyPep8
        """
                {'-D': '/N/u/gvonlasz/experiment/3',
                 '-N': '1',
                 '-o': 'gvonlasz-3.out',
                 '-p': 'delta',
                 '-t': '1',
                 'cluster': 'india',
                 'cmd': 'sbatch /N/u/gvonlasz/experiment/3/gvonlasz-3.sh',
                 'command': 'uname',
                 'count': 3,
                 'from': '/Users/big/.cloudmesh/gvonlasz-3.sh',
                 'id': 1346,
                 'options': '#SBATCH -t 1\n#SBATCH -o gvonlasz-3.out\n#SBATCH -N 1\n#SBATCH -p delta\n#SBATCH -D /N/u/gvonlasz/experiment/3\n',
                 'output': 'Submitted batch job 1346',
                 'queue': 'delta',
                 'remote_experiment_dir': '/N/u/gvonlasz/experiment/3',
                 'script': "#! /bin/sh\n#SBATCH -t 1\n#SBATCH -o gvonlasz-3.out\n#SBATCH -N 1\n#SBATCH -p delta\n#SBATCH -D /N/u/gvonlasz/experiment/3\n\nsrun -l echo '#CLOUDMESH: Starting'\nsrun -l uname\nsrun -l echo '#CLOUDMESH: Test ok'",
                 'script_base_name': 'gvonlasz-3',
                 'script_name': 'gvonlasz-3.sh',
                 'script_output': 'gvonlasz-3.out',
                 'to': 'india:/N/u/gvonlasz/experiment/3',
                 'username': '******'}
                """

        """
        we also want to store what part of the .out file,

        BASIL_RESERVATION_ID:
        SLURM_CPU_BIND:
        SLURM_JOB_ID: 1351
        SLURM_JOB_CPUS_PER_NODE: 12
        SLURM_JOB_DEPENDENCY:
        SLURM_JOB_NAME: gvonlasz-8.sh
        SLURM_JOB_NODELIST: d001
        SLURM_JOB_NUM_NODES: 1
        SLURM_MEM_BIND:
        SLURM_TASKS_PER_NODE: 12
        MPIRUN_NOALLOCATE:
        MPIRUN_NOFREE:
        SLURM_NTASKS_PER_CORE:
        SLURM_NTASKS_PER_NODE:
        SLURM_NTASKS_PER_SOCKET:
        SLURM_RESTART_COUNT:
        SLURM_SUBMIT_DIR: /N/u/gvonlasz
        MPIRUN_PARTITION:

        so maybe we want to use some of the names here as they reflect the env vars
        """

        #
        # add data to database
        #
        # remove the - options

        for key in ['-t', '-N', '-p', '-o', '-D', '-e']:
            if key in data:
                print(key, data[key])
                del data[key]
        data['status'] = 'started'
        cls.add_db(**data)

        return data
예제 #3
0
    def remote(cls, host, force=False):
        """

        TODO: there is a bug in the instalation of kilo the openrc file on
        the remote machine is not called openrc.sh but contains username and
        project number.

        :param host: the remote host
        :param force:
        :return:
        """

        config = ConfigDict("cloudmesh.yaml")

        host_spec = config["cloudmesh.clouds." + host]
        host_credentials = host_spec["credentials"]

        if 'cm_openrc' in host_spec:
            Console.ok("looking for openrc")
        else:
            Console.error("no cm_openrc specified in the host")
            return

        hostname = config["cloudmesh.clouds." + host + ".cm_host"]
        Console.ok("fetching information from {:}  ...".format(host))

        openrc = host_spec["cm_openrc"]

        directory = os.path.dirname(openrc)
        base = os.path.basename(openrc)

        _from_dir = "{:}:{:}".format(hostname,
                                     directory + "/*").replace("~/", "")
        # _to_dir = os.path.dirname(Config.path_expand(directory))
        # FIX: Issues with path expanding on Windows
        _to_dir = os.path.realpath(os.path.expanduser(directory))
        '''
        In Windows, SCP fails with path such as C:\\Users\\...,
            and passes with '~/.cloudmesh/...'
        But on Linux machines, it fails with ~/.cloudmesh/...
            and passes with /home/user/...
        Hence, adding OS check below for SCP copy directory
        '''
        os_type = platform.system().lower()
        if 'windows' not in os_type:
            directory = _to_dir

        # FIX: fix for scp not working on Windows, because scp does not
        # understand
        # paths in format: "C:/Users/<>", rather expects "~/.cloudmesh/<>"
        # openrc_file = Config.path_expand(openrc)
        openrc_file = os.path.realpath(os.path.expanduser(openrc))
        print("From:  ", _from_dir)
        print("To:    ", _to_dir)
        print("Openrc:", openrc_file)

        cls.make_dir(_to_dir)
        r = ""
        Console.ok("Reading rc file from {}".format(host))
        try:
            r = Shell.scp('-r', _from_dir, directory)
        except Exception as e:
            print(e)
            return

        #
        # TODO: the permission are not yet right
        #
        os.chmod(_to_dir, 0o700)
        for root, dirs, _ in os.walk(_to_dir):
            for d in dirs:
                os.chmod(os.path.join(root, d), 0o700)
        #
        # END PERMISSION
        #

        with open(openrc_file, 'r') as f:
            lines = f.read().split("\n")

        config = ConfigDict("cloudmesh.yaml")
        for line in lines:
            if line.strip().startswith("export"):
                line = line.replace("export ", "")
                key, value = line.split("=", 1)
                config["cloudmesh"]["clouds"][host]["credentials"][key] = value
        host_spec = config["cloudmesh"]["clouds"][host]
        credentials = host_spec["credentials"]

        if "cm_openrc" in host_spec:
            openrc = host_spec["cm_openrc"]
            for attribute in credentials:
                if attribute in openrc:
                    openrc.replace(attribute, credentials[attribute])
        config.save()
        config = ConfigDict("cloudmesh.yaml")
        return config["cloudmesh"]["clouds"][host]["credentials"]