Example #1
File: Batch.py Project: juaco77/cm
    def __init__(self, debug):
        """
        Initializes the SlurmCluster class

        :param debug: switch debug information on or off
        """
        current_path = os.path.dirname(os.path.realpath(__file__))
        self.workspace = os.path.join(current_path,
                                      "batch_workspace/slurm_batch.yaml")
        if not os.path.exists(os.path.dirname(self.workspace)):
            os.makedirs(os.path.dirname(self.workspace))
        self.cm_config = Config()
        self.batch_config = GenericConfig(self.workspace)
        self.debug = debug
        self.all_jobIDs = []
        self.slurm_cluster = {}
        self.job_metadata = {}
Example #2
    def __init__(self, debug):
        """
        Initializes the VirtualCluster class

        :param debug: switch debug information on or off
        """
        current_path = os.path.dirname(os.path.realpath(__file__))
        self.workspace = os.path.join(current_path,
                                      "vcluster_workspace/vcluster.yaml")
        if not os.path.exists(os.path.dirname(self.workspace)):
            os.makedirs(os.path.dirname(self.workspace))
        self.cm_config = Config()
        self.vcluster_config = GenericConfig(self.workspace)
        self.debug = debug
        self.all_pids = []
        self.virt_cluster = {}
        self.runtime_config = {}
        self.job_metadata = {}
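
Both constructors resolve their YAML workspace relative to the module file itself and create only the containing directory; the file's content (the 'virtual-cluster', 'runtime-config', and 'job-metadata' sections) is written later through GenericConfig.deep_set() calls shown further down. A minimal standalone sketch of that path logic (the absolute path in the comment is hypothetical):

import os

# Resolve the workspace directory next to the current module,
# exactly as the constructors above do.
current_path = os.path.dirname(os.path.realpath(__file__))
workspace = os.path.join(current_path, "vcluster_workspace/vcluster.yaml")
# e.g. /opt/cm/cloudmesh/vcluster/vcluster_workspace/vcluster.yaml

# Only the directory is created here; the YAML content is added on demand.
if not os.path.exists(os.path.dirname(workspace)):
    os.makedirs(os.path.dirname(workspace))
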
Example #3
class VirtualCluster(object):
    def __init__(self, debug):
        """
        Initializes the VirtualCluster class

        :param debug: switch debug information on or off
        """
        current_path = os.path.dirname(os.path.realpath(__file__))
        self.workspace = os.path.join(current_path,
                                      "vcluster_workspace/vcluster.yaml")
        if not os.path.exists(os.path.dirname(self.workspace)):
            os.makedirs(os.path.dirname(self.workspace))
        self.cm_config = Config()
        self.vcluster_config = GenericConfig(self.workspace)
        self.debug = debug
        self.all_pids = []
        self.virt_cluster = {}
        self.runtime_config = {}
        self.job_metadata = {}

    def _config_validator(self):
        """
        validates the configuration of a run based on the information about its virtual cluster, runtime configuration
        and the job metadata

        :return:
        """
        job_metadata = self.job_metadata
        virt_cluster = self.virt_cluster
        runtime_config = self.runtime_config
        for node in virt_cluster:
            if 'name' not in virt_cluster[node].keys():
                raise ValueError(
                    "node {}: the 'name' keyword, indicating the hostname, is missing"
                    .format(node))
            if 'sshconfigpath' not in virt_cluster[node]['credentials'].keys():
                raise ValueError("%s: 'sshconfigpath' keyword is missing" %
                                 node)
            if not os.path.isfile(
                    os.path.expanduser(
                        virt_cluster[node]['credentials']['sshconfigpath'])):
                raise ValueError("%s: The ssh config file %s does not exists" % (node, virt_cluster[node] \
                    ['credentials']['sshconfigpath']))
        if not os.path.isfile(os.path.expanduser(job_metadata['script_path'])):
            raise ValueError("The script file %s does not exists" %
                             (job_metadata['script_path']))
        if runtime_config['input-type'] == 'params+file':
            if not os.path.isfile(
                    os.path.expanduser(job_metadata['argfile_path'])):
                raise ValueError("The arg file %s does not exists" %
                                 (job_metadata['arg_file_path']))

    def _clean_remote_in_parallel(self, target_node, remote_path):
        """
        This method is used to spawn processes to clean the remotes of a particular job.

        :param target_node: the node on which the data is going to be removed
        :param remote_path: path of the data to be removed
        :return:
        """
        target_node_info = self.virt_cluster[target_node]
        ssh_caller = lambda *x: self._ssh(
            target_node_info['name'],
            os.path.expanduser(target_node_info['credentials']['sshconfigpath']),
            *x)
        ssh_caller('rm -rf {}'.format(remote_path))
        if len(ssh_caller('ls {}'.format(remote_path))) == 0:
            print("Node {} cleaned successfully.".format(target_node))
        else:
            print("Error: Node {} could not be cleaned.".format(target_node))

    def _connection_test_in_parallel(self, target_node):
        """
        This method is used to test the connection to cluster nodes in parallel

        :param target_node: the node to which the connection is going to be tested
        :return:
        """
        target_node_info = self.virt_cluster[target_node]
        ssh_caller = lambda *x: self._ssh(
            target_node_info['name'],
            os.path.expanduser(target_node_info['credentials']['sshconfigpath']),
            *x)
        if len(ssh_caller('uname -a')) > 0:
            print("Node {} is accessible.".format(target_node))
        else:
            print("Error: Node {} cannot be accessed.".format(target_node))

    def _create_config(self, config_name, proc_num, download_proc_num,
                       download_later, input_type, output_type):
        """
        This method is used to create a runtime-configuration.

        :param config_name: name of the runtime configuration
        :param proc_num: number of processes to be spawned in that runtime for submitting the jobs
        :param download_proc_num: number of processes to be spawned in that runtime for fetching the results
        :param download_later: a flag indicating whether or not the script should wait for the results after the scripts are submitted
        :param input_type: type of the input of the script to be run remotely
        :param output_type: type of the output of the script to be run remotely
        :return:
        """
        config_tosave = {config_name: {}}
        config_tosave[config_name].update({
            "proc_num": proc_num,
            "download_proc_num": download_proc_num,
            "download-later": download_later,
            "input-type": input_type,
            "output-type": output_type
        })
        self.vcluster_config.deep_set(['runtime-config'], config_tosave)
        print("Runtime-configuration created/replaced successfully.")

    def _create_vcluster(self,
                         vcluster_name,
                         cluster_list=(),
                         computer_list=()):
        """
        This method is used to create a virtual cluster

        :param vcluster_name: name of the virtual cluster
        :param cluster_list: list of the clusters to be added to the virtual cluster
        :param computer_list: list of the computers to be used from the clusters in cluster_list. If computer_list is left empty, all of the computers are used
        :return:
        """
        vcluster_tosave = {vcluster_name: {}}
        for cluster in cluster_list:
            for computer in self.cm_config.get('cluster.{}'.format(cluster)):
                if computer in computer_list or computer_list == '':
                    vcluster_tosave[vcluster_name].update({
                        computer:
                        dict(
                            self.cm_config.get('cluster.{}.{}'.format(
                                cluster, computer)))
                    })
        self.vcluster_config.deep_set(['virtual-cluster'], vcluster_tosave)
        print("Virtual cluster created/replaced successfully.")

    @staticmethod
    def _execute_in_parallel(func_args):
        """
        This is a method used for running methods in parallel

        :param func_args: tuple of (instance, method name, *method arguments)
        :return:
        """
        target_class = func_args[0]
        method_to_call = getattr(target_class, func_args[1])
        args = list(func_args[2:])
        return method_to_call(*args)
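
    # The work items passed to Pool.map throughout this class follow the
    # convention (instance, method name, *method arguments); for example
    #   _execute_in_parallel((self, '_connection_test_in_parallel', 'node1'))
    # is equivalent to self._connection_test_in_parallel('node1')
    # ('node1' is a hypothetical node key).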

    def _fetch_results_in_parallel(self, job_metadata, node_pid_tuple,
                                   all_pids):
        """
        This method is used to fetch the results from remote nodes.

        :param job_metadata: the dictionary containing the information about the previously submitted job
        :param node_pid_tuple: the tuple containing destination node, destination pid and destination node index when the job was submitted
        :param all_pids: the shared list of (node, pid, index) tuples for jobs whose results have not yet been collected
        :return:
        """
        dest_node = node_pid_tuple[0]
        dest_pid = node_pid_tuple[1]
        node_idx = node_pid_tuple[2]
        dest_node_info = self.virt_cluster[dest_node]
        ssh_caller = lambda *x: self._ssh(
            dest_node_info['name'],
            os.path.expanduser(dest_node_info['credentials']['sshconfigpath']),
            *x)
        scp_caller = lambda *x: self._scp(
            dest_node_info['name'],
            os.path.expanduser(dest_node_info['credentials']['sshconfigpath']),
            *x)
        ps_output = ssh_caller('ps', '-ef', '|', 'grep', dest_pid.strip('\n'),
                               '|', 'grep -v grep')
        if len(ps_output) == 0 and node_pid_tuple in [pid for pid in all_pids]:
            if not os.path.exists(job_metadata['local_path']):
                os.makedirs(job_metadata['local_path'])
            if self.runtime_config['output-type'] == 'stdout':
                scp_caller(
                    '%s:%s' %
                    (dest_node_info['name'],
                     os.path.join(
                         job_metadata['remote_path'],
                         self.add_suffix_to_path('outputfile_%d' % node_idx,
                                                 job_metadata['suffix']))),
                    os.path.join(job_metadata['local_path'], ''))
            elif self.runtime_config['output-type'] in ['file', 'stdout+file']:
                nested_remote_path = os.path.join(job_metadata['remote_path'],
                                                  'run{}'.format(node_idx))
                scp_caller('-r',
                           '%s:%s' % (dest_node_info['name'], nested_remote_path),
                           os.path.join(job_metadata['local_path'], ''))
            all_pids.remove((dest_node, dest_pid, node_idx))
            print("Results collected from %s." % dest_node)

    def _run_remote_job_in_parallel(self, job_metadata, param_idx, params,
                                    all_pids):
        """
        This method is used to spawn remote processes in parallel

        :param job_metadata: contains the information about the job
        :param param_idx: index of the parameter set passed as arguments
        :param params: the parameters passed as arguments for this run
        :param all_pids: the shared (managed) list collecting the pids of all submitted jobs
        :return:
        """
        available_nodes_num = len(list(self.virt_cluster.keys()))
        target_node_idx = param_idx % available_nodes_num
        target_node_key = list(self.virt_cluster.keys())[target_node_idx]
        target_node = self.virt_cluster[target_node_key]
        remote_pid = []
        ssh_caller = lambda *x: self._ssh(
            target_node['name'],
            os.path.expanduser(target_node['credentials']['sshconfigpath']),
            *x)
        scp_caller = lambda *x: self._scp(
            target_node['name'],
            os.path.expanduser(target_node['credentials']['sshconfigpath']),
            *x)

        # directory_check = ssh_caller('if test -d %s; then echo "exist"; fi' % job_metadata['remote_path'])
        # if len(directory_check) == 0:
        ssh_caller(
            'cd %s && mkdir job%s' %
            (job_metadata['raw_remote_path'], job_metadata['suffix']), True)
        if self.runtime_config['output-type'].lower() in [
                'file', 'stdout+file'
        ]:
            ssh_caller("cd {} && mkdir run{}".format(
                job_metadata['remote_path'], param_idx))
            nested_remote_path = os.path.join(
                job_metadata['remote_path'], 'run{}'.format(param_idx),
                job_metadata['script_name_with_suffix'])
            scp_caller(job_metadata['script_path'],
                       '%s:%s' % (target_node['name'], nested_remote_path))
            ssh_caller('chmod +x', nested_remote_path)
            if self.runtime_config['input-type'].lower() == 'params+file':
                scp_caller(
                    job_metadata['argfile_path'],
                    '%s:%s' % (target_node['name'],
                               os.path.join(job_metadata['remote_path'],
                                            'run{}'.format(param_idx),
                                            job_metadata['argfile_name'])))
        else:
            scp_caller(
                job_metadata['script_path'], '%s:%s' %
                (target_node['name'], job_metadata['remote_script_path']))
            ssh_caller('chmod +x', job_metadata['remote_script_path'])
            if self.runtime_config['input-type'].lower() == 'params+file':
                scp_caller(
                    job_metadata['argfile_path'], '%s:%s' %
                    (target_node['name'], job_metadata['remote_path']))

        if self.runtime_config['output-type'].lower() == 'stdout':
            output_path = os.path.join(
                job_metadata['remote_path'],
                self.add_suffix_to_path('outputfile_%d' % param_idx,
                                        job_metadata['suffix']))
            remote_pid = ssh_caller(
                'cd %s && nohup %s %s > %s 2>&1 </dev/null& echo $!' %
                (job_metadata['remote_path'],
                 job_metadata['remote_script_path'], params, output_path))
        elif self.runtime_config['output-type'].lower() == 'stdout+file':
            run_path = os.path.join(job_metadata['remote_path'],
                                    'run{}'.format(param_idx))
            output_path = os.path.join(
                run_path,
                self.add_suffix_to_path('outputfile_%d' % param_idx,
                                        job_metadata['suffix']))
            remote_pid = ssh_caller(
                'cd %s && nohup %s %s > %s 2>&1 </dev/null& echo $!' %
                (run_path,
                 os.path.join(run_path, job_metadata['script_name_with_suffix']),
                 params, output_path))
        elif self.runtime_config['output-type'].lower() == 'file':
            remote_pid = ssh_caller(
                'cd %s && nohup ./%s %s >&- & echo $!' %
                (os.path.join(job_metadata['remote_path'],
                              'run{}'.format(param_idx)),
                 job_metadata['script_name_with_suffix'], params))
        all_pids.append((target_node_key, remote_pid, param_idx))
        print('Remote Pid on %s: %s' %
              (target_node_key, remote_pid.strip('\n')))

    @staticmethod
    def _ssh(hostname, sshconfigpath, *args):
        """
        This method is used to create remote ssh connections

        :param hostname: hostname
        :param sshconfigpath: path to sshconfig for connecting to remote node
        :param args: the argument to be submitted via ssh
        :return:
        """
        hide_errors_flag = False
        if type(args[-1]) == bool:
            hide_errors_flag = True
            args = args[:-1]
        ssh = subprocess.Popen(["ssh", hostname, '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        result = ssh.stdout.readline()
        if not result:
            error = ssh.stderr.readlines()
            if len(error) > 0 and hide_errors_flag == False:
                print("ERROR in host %s: %s" % (hostname, error))
            return []
        else:
            try:
                return ''.join([chr(x) for x in result])
            except AttributeError:
                return [result.decode('utf-8').strip('\n')]
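
    # Usage note: a trailing boolean argument is consumed as the hide-errors
    # flag instead of being passed to ssh (see the ssh_caller(..., True) call
    # in _run_remote_job_in_parallel). The return value is [] when the remote
    # command produced no stdout, otherwise the first line of stdout.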

    @staticmethod
    def _scp(hostname, sshconfigpath, *args):
        """
        This method is used for scp from and to remote

        :param hostname: hostname
        :param sshconfigpath: ssh config file
        :param args: arguments used while copying
        :return:
        """
        ssh = subprocess.Popen(["scp", '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        middle_result = ssh.stdout.readlines()
        if not middle_result:
            error = ssh.stderr.readlines()
            if len(error) > 0:
                print("ERROR in host %s: %s" % (hostname, error))

    @staticmethod
    def add_suffix_to_path(path, suffix):
        """
        This method is used to add suffix to a path

        :param path: path
        :param suffix: suffix
        :return:
        """
        dir_path = os.path.dirname(path)
        full_filename = os.path.basename(path)
        filename, fileextention = os.path.splitext(full_filename)
        full_filename_new = filename + suffix + fileextention
        new_path = os.path.join(dir_path, full_filename_new)
        return new_path
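
    # Example: add_suffix_to_path('outputfile.txt', '_run1') returns
    # 'outputfile_run1.txt'; the suffix is inserted before the file extension.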

    def clean_remote(self, job_name, proc_num):
        """
        This method is used to spawn processes for cleaning the remote nodes

        :param job_name: name of previously submitted job for which the nodes are going to be cleaned
        :param proc_num: number of processes used for cleaning the remote nodes
        :return:
        """
        job_metadata = self.vcluster_config.get('job-metadata')[job_name]
        self.virt_cluster = self.vcluster_config.get('virtual-cluster')[
            job_metadata['cluster_name']]
        remote_path = job_metadata['remote_path']
        all_jobs = [(self, '_clean_remote_in_parallel', node, remote_path)
                    for node in list(self.virt_cluster)]
        pool = Pool(processes=proc_num)
        pool.map(self._execute_in_parallel, all_jobs)

    def connection_test(self, vcluster_name, proc_num):
        """
        This method is used for spawning processes for testing the connections to remote nodes of a vcluster

        :param vcluster_name: name of the virtual cluster the nodes of which are going to be tested
        :param proc_num: number of processes used for testing the remote nodes
        :return:
        """
        self.virt_cluster = self.vcluster_config.get(
            'virtual-cluster')[vcluster_name]
        all_jobs = [(self, '_connection_test_in_parallel', node)
                    for node in list(self.virt_cluster)]
        pool = Pool(processes=proc_num)
        pool.map(self._execute_in_parallel, all_jobs)

    def create(self, *args, **kwargs):
        """
        This is a caller for creator functions including config creator and vcluster creator

        :param args:
        :param kwargs:
        :return:
        """
        if len(args) > 5:
            self._create_config(*args)
        else:
            self._create_vcluster(*args, **kwargs)
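
    # Dispatch note: six or more positional arguments are treated as a runtime
    # configuration, anything shorter as a virtual cluster, e.g. (hypothetical values):
    #   create('test-config', 2, 2, False, 'params', 'stdout')   # -> _create_config
    #   create('test-vcluster', cluster_list=['mycluster'])      # -> _create_vcluster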

    def destroy(self, target, key):
        """
        Used to remove virtual clusters and runtime configs

        :param target: type of entity to be removed
        :param key: keyname of the entity to be removed
        :return:
        """
        if target == 'virtual-cluster':
            self.vcluster_config.remove(['virtual-cluster'], key)
            print("Virtual-cluster {} destroyed successfully.".format(key))
        elif target == 'runtime-config':
            self.vcluster_config.remove(['runtime-config'], key)
            print(
                "Runtime-configuration {} destroyed successfully.".format(key))
        else:
            raise ValueError("Target of destroying not found.")

    def fetch(self, job_name):
        """
        This method is used to fetch results from remote nodes

        :param job_name: the previously submitted job name
        :return:
        """
        job_metadata = self.vcluster_config.get('job-metadata')[job_name]
        self.virt_cluster = self.vcluster_config.get('virtual-cluster')[
            job_metadata['cluster_name']]
        self.runtime_config = self.vcluster_config.get('runtime-config')[
            job_metadata['config_name']]
        loaded_all_pids = [tuple(x) for x in job_metadata['nodes-pids']]
        all_pids = Manager().list()
        all_pids.extend(loaded_all_pids)
        pool = Pool(processes=self.runtime_config['download_proc_num'])
        print("collecting results")
        while len(all_pids) > 0:
            time.sleep(1)
            all_running_jobs = [(self, '_fetch_results_in_parallel', job_metadata,
                                 node_pid_tuple, all_pids)
                                for node_pid_tuple in loaded_all_pids
                                if node_pid_tuple in all_pids]
            pool.map(self._execute_in_parallel, all_running_jobs)
            print("waiting for other results if any...")
        print("All of the remote results collected.")

    def list(self, target, max_depth, current_depth=1, input_dict=None):
        """
        listing the current virtual clusters based on the vcluster_conf file.

        :param target: the type of entity to be listed ('virtual-clusters' or 'runtime-configs')
        :param max_depth: depth of information to be shown
        :param input_dict: used for recursion for depth of higher than 1
        :param current_depth: current depth of printing information
        :return:
        """
        if target == 'virtual-clusters' and input_dict is None:
            input_dict = self.vcluster_config.get('virtual-cluster')
        elif target == 'runtime-configs' and input_dict is None:
            input_dict = self.vcluster_config.get('runtime-config')
        elif input_dict is None:
            raise ValueError("Target of listing not found.")

        if max_depth >= current_depth:
            if type(input_dict) == dict:
                for key in input_dict:
                    key_to_print = key + ':' if max_depth >= current_depth else key
                    indent = current_depth if current_depth > 1 else current_depth - 1
                    print('\t' * indent, key_to_print)
                    if type(input_dict.get(key)) != dict:
                        print('\t' * (indent + 1), input_dict.get(key))
                    else:
                        for value in input_dict.get(key):
                            value_to_print = value + ':' if max_depth > current_depth else value
                            print('\t' * (indent + 1), value_to_print)
                            self.list(target,
                                      max_depth,
                                      input_dict=input_dict[key][value],
                                      current_depth=current_depth + 1)
            else:
                indent = current_depth if current_depth > 1 else current_depth - 1
                print('\t' * indent, input_dict)
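
    # Example (hypothetical call): list('runtime-configs', max_depth=1) prints
    # each configuration name and its parameter keys; max_depth=2 also prints
    # the parameter values, one extra tab level per depth.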

    def run(self, job_name, cluster_name, config_name, script_path,
            argfile_path, outfile_name, remote_path, local_path, params_list,
            suffix, overwrite):
        """
        This method is used to create a job, validate it and run it on remote nodes

        :param job_name: name of the job to create
        :param cluster_name: cluster on which the job is going to run
        :param config_name: name of the configuration based on which the job is going to run
        :param script_path: path of the script to be run remotely
        :param argfile_path: path of the file that has to be passed to the script as an argument, if any
        :param outfile_name: output filename resulting from running the script, if any
        :param remote_path: path on the remotes to which the script is going to be copied and from which it is run
        :param local_path: local path to which the results are going to be copied
        :param params_list: list of the parameters that are going to be passed to the script if any
        :param suffix: suffix of the filenames in the job
        :param overwrite: if the job already exists, this flag overwrites the previous job with the same name
        :return:
        """
        if params_list is None:
            raise ValueError(
                'param-list is not set. This value determines how many instances of the '
                'target application will run remotely. Therefore, even if the parameter '
                'is empty, add commas for every run you expect.')
        if self.vcluster_config.get('job-metadata') is not None and \
                job_name in list(self.vcluster_config.get('job-metadata').keys()) and \
                overwrite is False:
            raise RuntimeError(
                "The job {} exists in the configuration file; if you want to "
                "overwrite the job, use the --overwrite argument.".format(job_name))
        self.virt_cluster = self.vcluster_config.get(
            'virtual-cluster')[cluster_name]
        self.runtime_config = self.vcluster_config.get(
            'runtime-config')[config_name]
        job_metadata = {job_name: {}}
        job_metadata[job_name]['suffix'] = suffix
        job_metadata[job_name]['cluster_name'] = cluster_name
        job_metadata[job_name]['config_name'] = config_name
        job_metadata[job_name]['raw_remote_path'] = remote_path
        job_metadata[job_name]['script_path'] = os.path.abspath(script_path)
        job_metadata[job_name]['argfile_path'] = argfile_path
        job_metadata[job_name]['argfile_name'] = ntpath.basename(argfile_path)
        if len(job_metadata[job_name]['argfile_name']) > 0:
            job_metadata[job_name]['params_list'] = [
                '{} {}'.format(job_metadata[job_name]['argfile_name'], x)
                for x in params_list]
        else:
            job_metadata[job_name]['params_list'] = params_list
        job_metadata[job_name]['outfile_name'] = outfile_name
        job_metadata[job_name]['script_name'] = ntpath.basename(script_path)
        job_metadata[job_name]['script_name_with_suffix'] = self.add_suffix_to_path(
            job_metadata[job_name]['script_name'], suffix)
        job_metadata[job_name]['remote_path'] = os.path.join(
            remote_path, 'job' + suffix, '')
        job_metadata[job_name]['remote_script_path'] = os.path.join(
            job_metadata[job_name]['remote_path'],
            job_metadata[job_name]['script_name_with_suffix'])
        job_metadata[job_name]['local_path'] = local_path
        self.job_metadata = job_metadata[job_name]
        self._config_validator()
        self.vcluster_config.deep_set(['job-metadata'], job_metadata)
        all_pids = Manager().list()
        all_jobs = [(self, '_run_remote_job_in_parallel', job_metadata[job_name],
                     param_idx, param, all_pids)
                    for param_idx, param in
                    enumerate(job_metadata[job_name]['params_list'])]
        pool = Pool(processes=self.runtime_config['proc_num'])
        pool.map(self._execute_in_parallel, all_jobs)
        self.all_pids = all_pids
        self.vcluster_config.deep_set(['job-metadata', job_name, 'nodes-pids'],
                                      [pid for pid in all_pids])
        if not self.runtime_config['download-later']:
            pool = Pool(processes=self.runtime_config['download_proc_num'])
            print("collecting results")
            while len(all_pids) > 0:
                time.sleep(3)
                all_running_jobs = [(self, '_fetch_results_in_parallel',
                                     job_metadata[job_name], node_pid_tuple,
                                     all_pids)
                                    for node_pid_tuple in
                                    job_metadata[job_name]['nodes-pids']
                                    if node_pid_tuple in all_pids]
                pool.map(self._execute_in_parallel, all_running_jobs)
                print("waiting for other results if any...")

            print("All of the remote results collected.")

    def set_param(self, target, name, parameter, value):
        """
        Used to set a specific parameter in the configuration

        :param target: the entity type on which the parameter is going to be set, e.g. runtime-config
        :param name: the entity name on which the parameter is going to be set, e.g. test-config32
        :param parameter: name of the parameter to be set
        :param value: value of that parameter to be set
        :return:
        """
        if target == 'virtual-cluster':
            self.vcluster_config.deep_set(['virtual-cluster', name, parameter],
                                          value)
            print(
                "Virtual-cluster parameter {} set to {} successfully.".format(
                    parameter, value))
        elif target == 'runtime-config':
            self.vcluster_config.deep_set(['runtime-config', name, parameter],
                                          value)
            print("Runtime-configuration parameter {} set to {} successfully.".
                  format(parameter, value))
        else:
            raise ValueError("Target of variable set not found.")
Example #4
File: Batch.py Project: juaco77/cm
class SlurmCluster(object):
    def __init__(self, debug):
        """
        Initializes the SlurmCluster class

        :param debug: switch debug information on or off
        """
        current_path = os.path.dirname(os.path.realpath(__file__))
        self.workspace = os.path.join(current_path,
                                      "batch_workspace/slurm_batch.yaml")
        if not os.path.exists(os.path.dirname(self.workspace)):
            os.makedirs(os.path.dirname(self.workspace))
        self.cm_config = Config()
        self.batch_config = GenericConfig(self.workspace)
        self.debug = debug
        self.all_jobIDs = []
        self.slurm_cluster = {}
        self.job_metadata = {}

    def create(self,
               job_name,
               slurm_cluster_name,
               slurm_script_path,
               input_type,
               job_script_path,
               argfile_path,
               remote_path,
               local_path,
               suffix,
               overwrite=False):
        """
        This method is used to create a job for running on remote slurm cluster

        :param job_name: name of the job to create
        :param slurm_cluster_name: slurm cluster on which the job is going to run
        :param slurm_script_path: path of the slurm script
        :param input_type: type of the input for the script that is going to be run on the remote cluster, possible values: params, params+file
        :param job_script_path: path of the file that is going to be run on the cluster via the slurm script
        :param argfile_path: path of the file that has to be passed to the script as an argument, if any
        :param remote_path: path on the remote cluster to which the scripts are going to be copied and from which they are run
        :param local_path: local path to which the results are going to be copied
        :param suffix: suffix of the filenames in the job
        :param overwrite: if the job already exists, this flag overwrites the previous job with the same name
        :return:
        """
        if self.batch_config.get('job-metadata') is not None and \
                job_name in list(self.batch_config.get('job-metadata').keys()) and \
                overwrite is False:
            raise RuntimeError(
                "The job {} exists in the configuration file; if you want to "
                "overwrite the job, use the --overwrite argument.".format(job_name))
        slurm_cluster = self.cm_config.get('cloudmesh').get(
            'cluster')[slurm_cluster_name]
        tmp_cluster = {slurm_cluster_name: dict(slurm_cluster)}
        self.batch_config.deep_set(['slurm_cluster'], tmp_cluster)
        job_metadata = {job_name: {}}
        job_metadata[job_name]['suffix'] = suffix
        job_metadata[job_name]['slurm_cluster_name'] = slurm_cluster_name
        job_metadata[job_name]['input_type'] = input_type
        job_metadata[job_name]['raw_remote_path'] = remote_path
        job_metadata[job_name]['slurm_script_path'] = os.path.abspath(
            slurm_script_path)
        job_metadata[job_name]['job_script_path'] = os.path.abspath(
            job_script_path)
        job_metadata[job_name]['argfile_path'] = os.path.abspath(argfile_path)
        job_metadata[job_name]['argfile_name'] = ntpath.basename(argfile_path)
        job_metadata[job_name]['script_name'] = ntpath.basename(
            job_script_path)
        job_metadata[job_name]['slurm_script_name'] = ntpath.basename(
            slurm_script_path)
        job_metadata[job_name]['remote_path'] = os.path.join(
            remote_path, 'job' + suffix, '')
        job_metadata[job_name]['remote_script_path'] = os.path.join(
            job_metadata[job_name]['remote_path'],
            job_metadata[job_name]['script_name'])
        job_metadata[job_name]['remote_slurm_script_path'] = os.path.join(
            job_metadata[job_name]['remote_path'],
            job_metadata[job_name]['slurm_script_name'])
        job_metadata[job_name]['local_path'] = local_path
        self.job_metadata = job_metadata[job_name]
        self.batch_config.deep_set(['job-metadata'], job_metadata)
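
        # Hedged sketch of what create() leaves in slurm_batch.yaml
        # (trimmed; names and paths hypothetical):
        #
        #   slurm_cluster:
        #     myslurm: {name: login.example.com, credentials: {sshconfigpath: ~/.ssh/config}}
        #   job-metadata:
        #     test-job:
        #       suffix: _run1
        #       remote_path: ~/batch_ws/job_run1/
        #       remote_slurm_script_path: ~/batch_ws/job_run1/submit.slurm
        #       local_path: ./results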

    @staticmethod
    def _execute_in_parallel(func_args):
        """
        This is a method used for running methods in parallel

        :param func_args: tuple of (instance, method name, *method arguments)
        :return:
        """
        target_class = func_args[0]
        method_to_call = getattr(target_class, func_args[1])
        args = list(func_args[2:])
        return method_to_call(*args)

    def _fetch_results_in_parallel(self, job_metadata, job_id, all_job_ids):
        """
        This method is used to fetch the results from remote nodes.

        :param job_metadata: the dictionary containing the information about the previously submitted job
        :param job_id: the Slurm job ID of the previously submitted job
        :param all_job_ids: the shared list of job IDs whose results have not yet been collected
        :return:
        """
        dest_node_info = self.slurm_cluster
        dest_job_id = job_id
        ssh_caller = lambda *x: self._ssh(
            dest_node_info['name'],
            os.path.expanduser(dest_node_info['credentials']['sshconfigpath']),
            *x)
        scp_caller = lambda *x: self._scp(
            dest_node_info['name'],
            os.path.expanduser(dest_node_info['credentials']['sshconfigpath']),
            *x)
        ps_output = ssh_caller("qstat -u $USER | grep %s" % job_id)
        if len(ps_output) == 0 or ' c ' in ps_output.lower():

            if not os.path.exists(job_metadata['local_path']):
                os.makedirs(job_metadata['local_path'])
            scp_caller(
                '-r', '%s:%s' %
                (dest_node_info['name'], job_metadata['remote_path']),
                os.path.join(job_metadata['local_path'], ''))
            os.remove(
                os.path.join(
                    job_metadata['local_path'],
                    os.path.basename(
                        os.path.normpath(job_metadata['remote_path'])),
                    job_metadata['script_name']))
            os.remove(
                os.path.join(
                    job_metadata['local_path'],
                    os.path.basename(
                        os.path.normpath(job_metadata['remote_path'])),
                    job_metadata['slurm_script_name']))
            if job_metadata['input_type'] == 'params+file':
                os.remove(
                    os.path.join(
                        job_metadata['local_path'],
                        os.path.basename(
                            os.path.normpath(job_metadata['remote_path'])),
                        job_metadata['argfile_name']))
            all_job_ids.remove(dest_job_id)
            print("Results collected from %s for jobID %s" %
                  (dest_node_info['name'], dest_job_id))

    @staticmethod
    def _ssh(hostname, sshconfigpath, *args):
        """
        This method is used to create remote ssh connections

        :param hostname: hostname
        :param sshconfigpath: path to sshconfig for connecting to remote node
        :param args: the argument to be submitted via ssh
        :return:
        """
        hide_errors_flag = False
        if type(args[-1]) == bool:
            hide_errors_flag = True
            args = args[:-1]
        ssh = subprocess.Popen(["ssh", hostname, '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        result = ssh.stdout.readline()
        if not result:
            error = ssh.stderr.readlines()
            if len(error) > 0 and hide_errors_flag == False:
                print("ERROR in host %s: %s" % (hostname, error))
            return []
        else:
            try:
                return ''.join([chr(x) for x in result])
            except AttributeError:
                return [result.decode('utf-8').strip('\n')]

    @staticmethod
    def _scp(hostname, sshconfigpath, *args):
        """
        This method is used for scp from and to remote

        :param hostname: hostname
        :param sshconfigpath: ssh config file
        :param args: arguments used while copying
        :return:
        """
        ssh = subprocess.Popen(["scp", '-F', sshconfigpath, *args],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        middle_result = ssh.stdout.readlines()
        if not middle_result:
            error = ssh.stderr.readlines()
            if len(error) > 0:
                print("ERROR in host %s: %s" % (hostname, error))

    @staticmethod
    def add_suffix_to_path(path, suffix):
        """
        This method is used to add suffix to a path

        :param path: path
        :param suffix: suffix
        :return:
        """
        dir_path = os.path.dirname(path)
        full_filename = os.path.basename(path)
        filename, fileextention = os.path.splitext(full_filename)
        full_filename_new = filename + suffix + fileextention
        new_path = os.path.join(dir_path, full_filename_new)
        return new_path

    def clean_remote(self, job_name):
        """
        This method is used to spawn processes for cleaning the remote nodes

        :param job_name: name of previously submitted job for which the nodes are going to be cleaned
        :return:
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        target_cluster_info = self.batch_config.get('slurm_cluster')[
            job_metadata['slurm_cluster_name']]
        remote_path = job_metadata['remote_path']

        ssh_caller = lambda *x: self._ssh(
            target_cluster_info['name'],
            os.path.expanduser(target_cluster_info['credentials']['sshconfigpath']),
            *x, True)
        ssh_caller('rm -rf {}'.format(remote_path))
        if len(ssh_caller('ls {}'.format(remote_path))) == 0:
            print("Job {} cleaned successfully.".format(job_name))
        else:
            print("Error: Job {} could not be cleaned.".format(job_name))

    def connection_test(self, slurm_cluster_name):
        """
        This method is used for testing the connection to the slurm cluster connection node

        :param slurm_cluster_name: name of the slurm cluster which is going to be tested
        :return:
        """
        target_node_info = self.batch_config.get(
            'slurm_cluster')[slurm_cluster_name]
        ssh_caller = lambda *x: self._ssh(
            target_node_info['name'],
            os.path.expanduser(target_node_info['credentials']['sshconfigpath']),
            *x)
        if len(ssh_caller('uname -a')) > 0:
            print("Slurm Cluster {} is accessible.".format(
                target_node_info['name']))
        else:
            print("Error: Slurm Cluster {} cannot be accessed.".format(
                target_node_info['name']))

    def remove(self, target, key):
        """
        Used to remove virtual clusters and runtime configs

        :param target: type of entity to be removed
        :param key: keyname of the entity to be removed
        :return:
        """
        if target == 'slurm-cluster':
            self.batch_config.remove(['slurm_cluster'], key)
            print("Slurm-cluster {} removeed successfully.".format(key))
        elif target == 'job':
            self.batch_config.remove(['job-metadata'], key)
            print("Job {} removeed successfully.".format(key))
        else:
            raise ValueError("Target of removeing not found.")

    def fetch(self, job_name):
        """
        This method is used to fetch results from remote nodes

        :param job_name: the previously submitted job name
        :return:
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        self.slurm_cluster = self.batch_config.get('slurm_cluster')[
            job_metadata['slurm_cluster_name']]
        loaded_all_job_ids = [x for x in job_metadata['jobIDs']]
        all_job_ids = Manager().list()
        all_job_ids.extend(loaded_all_job_ids)
        pool = Pool(processes=1)
        print("collecting results")
        while len(all_job_ids) > 0:
            time.sleep(1)
            all_running_jobs = [(self, '_fetch_results_in_parallel', job_metadata,
                                 jobID, all_job_ids)
                                for jobID in loaded_all_job_ids
                                if jobID in all_job_ids]
            pool.map(self._execute_in_parallel, all_running_jobs)
            print("waiting for other results if any...")
        print("All of the remote results collected.")

    def list(self, target, max_depth, current_depth=1, input_dict=None):
        """
        listing the target slurm clusters or job-metadata

        :param target: the type of entity to be listed ('slurm-clusters' or 'jobs')
        :param max_depth: depth of information to be shown
        :param current_depth: current depth of printing information
        :param input_dict: used for recursion for depth of higher than 1
        :return:
        """
        if target == 'slurm-clusters' and input_dict is None:
            input_dict = self.batch_config.get('slurm_cluster')
        elif target == 'jobs' and input_dict is None:
            input_dict = self.batch_config.get('job-metadata')
        elif input_dict is None:
            raise ValueError("Target of listing not found.")

        if max_depth >= current_depth:
            if type(input_dict) == dict:
                for key in input_dict:
                    key_to_print = key + ':' if max_depth >= current_depth else key
                    indent = current_depth if current_depth > 1 else current_depth - 1
                    print('\t' * indent, key_to_print)
                    if type(input_dict.get(key)) != dict:
                        print('\t' * (indent + 1), input_dict.get(key))
                    else:
                        for value in input_dict.get(key):
                            value_to_print = value + ':' if max_depth > current_depth else value
                            print('\t' * (indent + 1), value_to_print)
                            self.list(target,
                                      max_depth,
                                      input_dict=input_dict[key][value],
                                      current_depth=current_depth + 1)
            else:
                indent = current_depth if current_depth > 1 else current_depth - 1
                print('\t' * indent, input_dict)

    def run(self, job_name):
        """
        This method is used to submit a previously created job to the remote slurm cluster

        :param job_name: name of the previously created job to be submitted
        :return:
        """
        job_metadata = self.batch_config.get('job-metadata')[job_name]
        all_job_ids = Manager().list()
        cluster_name = job_metadata['slurm_cluster_name']
        slurm_cluster = self.batch_config.get('slurm_cluster').get(
            cluster_name)
        ssh_caller = lambda *x: self._ssh(
            slurm_cluster['name'],
            os.path.expanduser(slurm_cluster['credentials']['sshconfigpath']),
            *x)
        scp_caller = lambda *x: self._scp(
            slurm_cluster['name'],
            os.path.expanduser(slurm_cluster['credentials']['sshconfigpath']),
            *x)
        ssh_caller(
            'cd %s && mkdir job%s' %
            (job_metadata['raw_remote_path'], job_metadata['suffix']), True)
        scp_caller(
            job_metadata['slurm_script_path'], '%s:%s' %
            (slurm_cluster['name'], job_metadata['remote_slurm_script_path']))
        scp_caller(
            job_metadata['job_script_path'], '%s:%s' %
            (slurm_cluster['name'], job_metadata['remote_script_path']))
        ssh_caller('chmod +x', job_metadata['remote_script_path'])
        if job_metadata['input_type'].lower() == 'params+file':
            scp_caller(
                job_metadata['argfile_path'],
                '%s:%s' % (slurm_cluster['name'], job_metadata['remote_path']))

        remote_job_id = ssh_caller(
            "cd %s && qsub %s && qstat -u $USER | tail -n 1 | awk '{print $1}'"
            % (job_metadata['remote_path'],
               job_metadata['remote_slurm_script_path']))
        remote_job_id = remote_job_id.strip('\n')
        all_job_ids.append(remote_job_id)
        print('Remote job ID: %s' % remote_job_id)
        self.batch_config.deep_set(['job-metadata', job_name, 'jobIDs'],
                                   [pid for pid in all_job_ids])

    def set_param(self, target, name, parameter, value):
        """
        Used to set a specific parameter in the configuration

        :param target: the entity type on which the parameter is going to be set, e.g. runtime-config
        :param name: the entity name on which the parameter is going to be set, e.g. test-config32
        :param parameter: name of the parameter to be set
        :param value: value of that parameter to be set
        :return:
        """
        if target == 'slurm-cluster':
            self.batch_config.deep_set(['slurm_cluster', name, parameter],
                                       value)
            print("slurm-cluster parameter {} set to {} successfully.".format(
                parameter, value))
        elif target == 'job-metadata':
            self.batch_config.deep_set(['job-metadata', name, parameter],
                                       value)
            print("Job-metadata parameter {} set to {} successfully.".format(
                parameter, value))
        else:
            raise ValueError("Target of variable set not found.")