Example #1
    def __init__(self, config, channel=None):
        ''' Initialize the Slurm class

        Args:
             - Config (dict): Dictionary with all the config options.

        KWargs:
             - Channel (None): A channel is required for Slurm.
        '''

        self.channel = channel
        if self.channel is None:
            logger.error(
                "Provider:Slurm cannot be initialized without a channel")
            raise (ep_error.ChannelRequired(
                self.__class__.__name__,
                "Missing a channel to execute commands"))
        self.config = config
        self.sitename = config['site']
        self.current_blocksize = 0
        launcher_name = self.config["execution"]["block"].get(
            "launcher", "singleNode")
        self.launcher = Launchers.get(launcher_name, None)
        self.scriptDir = self.config["execution"]["scriptDir"]
        if not os.path.exists(self.scriptDir):
            os.makedirs(self.scriptDir)

        # Dictionary that keeps track of jobs, keyed on job_id
        self.resources = {}
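
The constructor above only reads a handful of keys from config. The sketch below shows the dictionary shape those accesses imply; the key names are taken from the code, while the provider class and channel names on the last line are hypothetical and do not appear in the snippet.

# Minimal config sketch inferred from the keys read above (illustrative only).
config = {
    "site": "midway",                    # stored as self.sitename
    "execution": {
        "scriptDir": ".scripts",         # created if it does not exist
        "block": {
            "launcher": "singleNode",    # looked up in the Launchers map
        },
    },
}

# provider = Slurm(config, channel=ssh_channel)   # hypothetical class/channel names
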
Example #2
    def __init__(self, config, channel=None):
        ''' Here we do initialization that is common across all cluster-style providers

        Args:
             - Config (dict): Dictionary with all the config options.

        KWargs:
             - Channel (None): A channel is required for all cluster-style providers
        '''
        self._scaling_enabled = True
        self._channels_required = True
        self.channel = channel
        if self.channel is None:
            logger.error("Provider: Cannot be initialized without a channel")
            raise (ep_error.ChannelRequired(
                self.__class__.__name__,
                "Missing a channel to execute commands"))
        self.config = config
        self.sitename = config['site']
        self.current_blocksize = 0
        launcher_name = self.config["execution"]["block"].get(
            "launcher", "singleNode")
        self.launcher = Launchers.get(launcher_name, None)
        self.max_walltime = wtime_to_minutes(
            self.config["execution"]["block"].get("walltime", '01:00:00'))

        self.scriptDir = self.config["execution"]["scriptDir"]
        if not os.path.exists(self.scriptDir):
            os.makedirs(self.scriptDir)

        # Dictionary that keeps track of jobs, keyed on job_id
        self.resources = {}
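
The only helper used here that is not shown elsewhere is wtime_to_minutes, which converts an 'HH:MM:SS' walltime string into minutes. Below is a minimal, self-contained sketch of such a converter; the actual helper the provider uses may validate input or round differently.

def wtime_to_minutes(walltime):
    """Convert an 'HH:MM:SS' walltime string to whole minutes (sketch only)."""
    hours, minutes, seconds = (int(part) for part in walltime.split(":"))
    total = hours * 60 + minutes
    if seconds > 0:
        total += 1  # round leftover seconds up to a full minute
    return total


print(wtime_to_minutes("01:00:00"))  # 60
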
Example #3
    def __init__(self, config, channel_script_dir=None, channel=None):
        ''' Initialize the local provider class

        This provider is unique because the `LocalChannel` is simple enough
        that a default can be provided. For this reason users can pass `channel=None`,
        and a default `LocalChannel` will be created.

        Args:
             - Config (dict): Dictionary with all the config options.
             - channel_script_dir (str): Script directory which will be
                   passed to the default `LocalChannel` (this has no effect
                   if an explicit `channel` is provided)
             - channel (Channel): Channel to use; if none is provided, a
                   default one will be created
        '''

        if channel is None:
            if channel_script_dir is None:
                self.channel = LocalChannel()
            else:
                self.channel = LocalChannel(scriptDir=channel_script_dir)
        else:
            self.channel = channel
        self.config = config
        self.sitename = config['site']
        self.current_blocksize = 0
        self.scriptDir = self.config["execution"]["scriptDir"]
        self.taskBlocks = self.config["execution"]["block"].get("taskBlocks", 1)
        launcher_name = self.config["execution"]["block"].get("launcher", "singleNode")
        self.launcher = Launchers.get(launcher_name, None)

        # Dictionary that keeps track of jobs, keyed on job_id
        self.resources = {}
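
Given the branching above, the provider can be constructed in three ways. The usage sketch below assumes a class name of Local and an existing my_channel object; neither name appears in the snippet.

config = {
    "site": "local",
    "execution": {
        "scriptDir": ".scripts",
        "block": {"taskBlocks": 4, "launcher": "singleNode"},
    },
}

# provider = Local(config)                                      # default LocalChannel
# provider = Local(config, channel_script_dir="/tmp/scripts")   # default channel, custom script dir
# provider = Local(config, channel=my_channel)                  # explicit channel takes precedence
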
Example #4
    def submit(self, cmd_string, blocksize, job_name="parsl.auto"):
        ''' Submits the cmd_string onto a Local Resource Manager job of blocksize parallel elements.
        Submit returns an ID that corresponds to the task that was just submitted.

        If tasks_per_node <  1:
             A fraction of a node (1/tasks_per_node) is provisioned

        If tasks_per_node == 1:
             A single node is provisioned

        If tasks_per_node >  1 :
             tasks_per_node * blocksize number of nodes are provisioned.

        Args:
             - cmd_string  :(String) Commandline invocation to be made on the remote side.
             - blocksize   :(float) - Not really used for local

        Kwargs:
             - job_name (String): Name for job, must be unique

        Returns:
             - None: At capacity, cannot provision more
             - job_id: (string) Identifier for the job

        '''

        job_name = "{0}.{1}".format(job_name, time.time())

        # Set script path
        script_path = "{0}/{1}.sh".format(self.scriptDir, job_name)
        script_path = os.path.abspath(script_path)

        # Wrap the command using the launcher resolved in __init__
        wrap_cmd_string = self.launcher(cmd_string, taskBlocks=self.taskBlocks)

        ret = self._write_submit_script(wrap_cmd_string, script_path)

        job_id, proc = execute_no_wait('bash {0}'.format(script_path), 3)
        self.resources[job_id] = {
            'job_id': job_id,
            'status': 'RUNNING',
            'blocksize': blocksize,
            'proc': proc
        }

        return job_id
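
execute_no_wait is not defined in this example; from its call site it starts the script without blocking and returns a (job_id, proc) pair. A rough, self-contained sketch of such a helper is below; the real utility may derive the identifier differently and may actually use the numeric second argument, which is ignored here.

import subprocess
import uuid


def execute_no_wait(cmd, walltime=None):
    """Start cmd without blocking and return (job_id, proc). Sketch only;
    the walltime argument is accepted but not enforced."""
    proc = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    return str(uuid.uuid4()), proc
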
Example #5
    def __init__(self, config, channel=None):
        ''' Initialize the GoogleCompute class

        Args:
             - Config (dict): Dictionary with all the config options.

        KWargs:
             - Channel (None): A channel is not required for Google Cloud.

        Google Compute instances require a few specific configuration options:
            - auth['keyfile'] (string): Path to the authorization private key JSON file.
                                        This is required for auth. A new one can be
                                        generated here:
                                        https://console.cloud.google.com/apis/credentials
            - options['projectID'] (string): Project ID from Google Compute Engine
            - options['region'] (string): Region in which to start instances
            - options['instanceType'] (string): Google instance type. Default: "n1-standard-1"
            - options['osProject'] (string): OS project code for Google Compute Engine
            - options['osFamily'] (string): OS family to request
            - options['googleVersion'] (string): Google Compute Engine API version to use ('v1' or 'beta')
        '''
        self.config = config
        self.sitename = config['site']
        self.options = self.config["execution"]["block"]["options"]
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.config["auth"][
            "keyfile"]
        version = self.options.get('googleVersion', 'v1')
        self.client = googleapiclient.discovery.build('compute', version)
        self.channel = None
        self.project_id = self.config["execution"]["block"]["options"][
            "projectID"]
        self.zone = self.get_correct_zone(
            self.config["execution"]["block"]["options"]["region"])
        launcher_name = self.config["execution"]["block"].get(
            "launcher", "singleNode")
        self.launcher = Launchers.get(launcher_name, None)
        self.scriptDir = self.config["execution"].get("scriptDir", ".scripts")
        self.name_int = 0
        if not os.path.exists(self.scriptDir):
            os.makedirs(self.scriptDir)

        # Dictionary that keeps track of jobs, keyed on job_id
        self.resources = {}
        self.current_blocksize = 0
        atexit.register(self.bye)
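
Putting the docstring and the attribute accesses together, a config for this provider could look like the sketch below. All values are placeholders rather than working credentials or project IDs.

config = {
    "site": "gcloud",
    "auth": {"keyfile": "/path/to/service-account.json"},  # exported as GOOGLE_APPLICATION_CREDENTIALS
    "execution": {
        "scriptDir": ".scripts",
        "block": {
            "launcher": "singleNode",
            "options": {
                "projectID": "my-project-id",
                "region": "us-central1",
                "instanceType": "n1-standard-1",
                "osProject": "debian-cloud",
                "osFamily": "debian-9",
                "googleVersion": "v1",
            },
        },
    },
}
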
Example #6
    def __init__(self, config, channel_script_dir=None, channel=None):
        ''' Initialize the local provider class

        Args:
             - Config (dict): Dictionary with all the config options.

        KWargs:
             - channel_script_dir (str): Not used in this variant.
             - channel (Channel): Channel used to execute commands.
        '''

        self.channel = channel
        self.config = config
        self.sitename = config['site']
        self.current_blocksize = 0
        self.scriptDir = self.config["execution"]["scriptDir"]
        self.taskBlocks = self.config["execution"]["block"].get(
            "taskBlocks", 1)
        launcher_name = self.config["execution"]["block"].get(
            "launcher", "singleNode")
        self.launcher = Launchers.get(launcher_name, None)

        # Dictionary that keeps track of jobs, keyed on job_id
        self.resources = {}
Example #7
    def __init__(self, config, channel=None):
        ''' Initialize the GridEngine class

        Args:
             - Config (dict): Dictionary with all the config options.

        KWargs:
             - Channel (None): A channel is required for GridEngine.
        '''
        self.channel = channel
        self.config = config
        self.sitename = config['site']
        self.current_blocksize = 0
        launcher_name = self.config["execution"]["block"].get("launcher", "singleNode")
        self.launcher = Launchers.get(launcher_name, None)
        self.scriptDir = self.config["execution"]["scriptDir"]
        if not os.path.exists(self.scriptDir):
            os.makedirs(self.scriptDir)
        # Dictionary that keeps track of jobs, keyed on job_id
        self.resources = {}
        atexit.register(self.bye)
Example #8
    def submit(self, cmd_string, blocksize, job_name="parsl.auto"):
        ''' Submits the cmd_string onto a Local Resource Manager job of blocksize parallel elements.
        Submit returns an ID that corresponds to the task that was just submitted.

        If tasks_per_node <  1:
             This is illegal; tasks_per_node must be a positive integer

        If tasks_per_node == 1:
             A single node is provisioned

        If tasks_per_node >  1 :
             tasks_per_node * blocksize number of nodes are provisioned.

        Args:
             - cmd_string  :(String) Commandline invocation to be made on the remote side.
             - blocksize   :(float)

        Kwargs:
             - job_name (String): Name for job, must be unique

        Returns:
             - None: At capacity, cannot provision more
             - job_id: (string) Identifier for the job

        '''

        if self.current_blocksize >= self.config["execution"]["block"].get("maxBlocks", 2):
            logger.warning("[%s] at capacity, cannot add more blocks now", self.sitename)
            return None

        # Note: Fix this later to avoid confusing behavior.
        # We should always allocate blocks in integer counts of node_granularity
        if blocksize < self.config["execution"]["block"].get("nodes", 1):
            blocksize = self.config["execution"]["block"].get("nodes", 1)

        # Set account options
        account_opt = ''
        if self.config["execution"]["block"]["options"].get("account", None):
            account_opt = "-A {0}".format(self.config["execution"]["block"]["options"]["account"])

        # Set job name
        job_name = "parsl.{0}.{1}".format(job_name,time.time())

        # Set script path
        script_path = "{0}/{1}.submit".format(self.config["execution"]["block"].get("script_dir",'./.scripts'),
                                              job_name)
        script_path = os.path.abspath(script_path)

        # Calculate nodes
        nodes = self.config["execution"]["block"].get("nodes", 1)
        logger.debug("Requesting blocksize:%s nodes:%s taskBlocks:%s", blocksize,
                     nodes,
                     self.config["execution"]["block"].get("taskBlocks", 1))

        job_config = self.config["execution"]["block"]["options"]
        job_config["nodes"] = nodes
        job_config["overrides"] = job_config.get("overrides", '')        

        # Wrap the cmd_string
        lname = self.config["execution"]["block"].get("launcher", "singleNode")
        launcher = Launchers.get(lname, None)
        job_config["user_script"] = launcher(cmd_string,
                                             self.config["execution"]["block"].get("taskBlocks", 1))
        
        # Get queue request if requested
        self.queue = ''
        if job_config.get("queue", None):
            self.queue = "-q {0}".format(job_config["queue"])


        logger.debug("Writing submit script")
        ret = self._write_submit_script(template_string, script_path, job_name, job_config)

        channel_script_path = self.channel.push_file(script_path, self.channel.script_dir)

        logger.debug("Executing : qsub -n {0} {1} -t {2} {3} {4}".format(nodes,
                                                                         self.queue,
                                                                         self.max_walltime,
                                                                         account_opt,
                                                                         channel_script_path))
        
        retcode, stdout, stderr = self.channel.execute_wait(
            "qsub -n {0} {1} -t {2} {3} {4}".format(nodes,
                                                    self.queue,
                                                    self.max_walltime,
                                                    account_opt,
                                                    channel_script_path), 5)

        # TODO : FIX this block
        if retcode != 0:
            logger.error("Launch failed stdout:\n{0}\nstderr:{1}\n".format(stdout, stderr))
        logger.debug("Retcode:%s STDOUT:%s STDERR:%s", retcode,
                     stdout.strip(), stderr.strip())

        job_id = None

        if retcode == 0:
            # We should be getting only one line back
            job_id = stdout.strip()
            self.resources[job_id] = {'job_id': job_id,
                                      'status': 'PENDING',
                                      'blocksize': blocksize}
        else:
            logger.error("Submission of command to scale_out failed: {0}".format(stderr))
            raise(ep_error.ScaleOutFailed(self.__class__,
                                          "Request to submit job to local scheduler failed"))

        logger.debug("Returning job id : {0}".format(job_id))
        return job_id
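
Both template_string and _write_submit_script are referenced above but not shown. The sketch below is one way such a template/writer pair could fit together for the four-argument call used in this example; the real template is scheduler-specific and carries different directives, and the real writer is a method on the provider.

import string

# Hypothetical template; the real one contains scheduler directives.
template_string = """#!/bin/bash
# Job: ${jobname}
${overrides}
${user_script}
"""


def _write_submit_script(template, script_path, job_name, job_config):
    """Fill the template from job_config and write it to script_path (sketch only)."""
    body = string.Template(template).safe_substitute(jobname=job_name, **job_config)
    with open(script_path, "w") as handle:
        handle.write(body)
    return True
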