Ejemplo n.º 1
0
 def _launch_block(self, external_block_id: str) -> Any:
     """Submit one block to the provider and return the provider's handle.

     The launch command template in ``self.launch_cmd`` is formatted with
     ``block_id=external_block_id`` and passed to ``self.provider.submit``
     together with the argument ``1``.

     Raises:
          ScalingFailed: if ``self.launch_cmd`` is None, or if the provider
              returns a falsy handle.
     """
     if self.launch_cmd is None:
         raise ScalingFailed(self.provider.label, "No launch command")

     command = self.launch_cmd.format(block_id=external_block_id)
     handle = self.provider.submit(command, 1)
     # Logged before the success check, so a failed submit is also recorded.
     logger.debug("Launched block {}->{}".format(external_block_id, handle))
     if handle:
         return handle
     raise ScalingFailed(self.provider.label,
                         "Attempts to provision nodes via provider has failed")
Ejemplo n.º 2
0
    def scale_out(self, blocks=1):
        """Scale out by submitting ``blocks`` new blocks to the provider.

        Each block is submitted with ``self.launch_cmd`` and the arguments
        ``1`` and ``self.workers_per_node``. Every handle returned by the
        provider is recorded in ``self.engines``.

        Parameters:
            blocks : int
               Number of blocks to be provisioned.

        Returns:
            list of the handles returned by ``provider.submit`` for the blocks
            launched by this call, or None when no provider is configured.

        Raises:
            ScalingFailed: if the provider returns a falsy handle for a block.
        """
        # The provider check is done once, up front. Previously an ``else``
        # bound to the ``for`` loop ran after every completed loop, so the
        # method logged an error and returned None even on success.
        if not self.provider:
            logger.error("No execution provider available")
            return None

        launched = []
        for i in range(blocks):
            block = self.provider.submit(self.launch_cmd, 1,
                                         self.workers_per_node)
            logger.debug("Launched block {}:{}".format(i, block))
            if not block:
                raise ScalingFailed(
                    self.provider.label,
                    "Attempts to provision nodes via provider has failed")
            self.engines.append(block)
            launched.append(block)
        return launched
Ejemplo n.º 3
0
    def scale_out(self, blocks=1):
        """Scales out the number of active workers by the number of blocks specified.

        Each block is submitted to the provider with ``self.launch_cmd`` and
        ``self.workers_per_node``; every handle returned is appended to
        ``self.blocks``.

        Parameters
        ----------

        blocks : int
             # of blocks to scale out. Default=1

        Returns
        -------

        list or None
             Handles returned by ``provider.submit`` for the blocks launched
             by this call. None (after logging an error once) when no
             provider is configured.

        Raises
        ------

        ScalingFailed
             If the provider returns a falsy handle for a block.
        """
        # The provider does not change between iterations, so check it once
        # instead of logging the same error for every requested block.
        if not self.provider:
            logger.error("No execution provider available")
            return None

        launched = []
        for i in range(blocks):
            block = self.provider.submit(self.launch_cmd,
                                         self.workers_per_node)
            logger.debug("Launched block {}:{}".format(i, block))
            if not block:
                raise ScalingFailed(
                    self.provider.label,
                    "Attempts to provision nodes via provider has failed")
            self.blocks.append(block)
            # Report what was launched; the result list used to stay empty.
            launched.append(block)
        return launched
Ejemplo n.º 4
0
    def scale_out(self, blocks=1, task_type=None):
        """Scales out the number of blocks by "blocks"

        For each block a new external block id is taken from
        ``self._block_counter``, the launch command template is formatted
        with that id and a worker type, and the command is submitted through
        ``self.config.provider``. Successful launches are recorded in
        ``self.blocks`` (external id -> provider handle) and
        ``self.block_id_map`` (provider handle -> external id).

        Parameters:
             blocks : int
                  Number of blocks to launch. Default=1
             task_type : optional
                  When truthy, it is used as the worker type in the launch
                  command and forwarded as a third argument to
                  ``provider.submit``. When falsy, the worker type is
                  ``'RAW'`` if ``self.config.scheduler_mode == 'hard'`` and
                  ``task_type`` itself otherwise.

        Returns:
             list of the external block ids (str) launched by this call, or
             None (after logging an error once) when no provider is
             configured.

        Raises:
             ScalingFailed: if the provider returns a falsy handle.
        """
        provider = self.config.provider
        if not provider:
            logger.error("No execution provider available")
            return None

        launched = []
        for _ in range(blocks):
            self._block_counter += 1
            external_block_id = str(self._block_counter)

            if not task_type and self.config.scheduler_mode == 'hard':
                worker_type = 'RAW'
            else:
                worker_type = task_type
            launch_cmd = self.launch_cmd.format(block_id=external_block_id,
                                                worker_type=worker_type)

            if not task_type:
                internal_block = provider.submit(launch_cmd, 1)
            else:
                internal_block = provider.submit(launch_cmd, 1, task_type)
            logger.debug("Launched block {}->{}".format(external_block_id, internal_block))

            if not internal_block:
                # Use the same provider object that was submitted to; the
                # error path previously read ``self.provider`` instead of
                # ``self.config.provider``.
                raise ScalingFailed(provider.label,
                                    "Attempts to provision nodes via provider has failed")
            self.blocks[external_block_id] = internal_block
            self.block_id_map[internal_block] = external_block_id
            # Report what was launched; the result list used to stay empty.
            launched.append(external_block_id)
        return launched
Ejemplo n.º 5
0
def _submit_flux_jobs(
    submission_queue: queue.Queue,
    stop_event: threading.Event,
    socket: zmq.Socket,
    working_dir: str,
    flux_executor_kwargs: Mapping,
    provider: ExecutionProvider,
    executor: FluxExecutor,
    flux_path: str,
    launch_cmd: str,
):
    """Thread target: launch a Flux instance via the provider and feed it jobs.

    Steps, in order:

    1. Bind ``socket`` to a random TCP port, format ``launch_cmd`` with the
       connection details and submit it through ``provider``.
    2. Receive the path of the ``flux`` package over the socket, add it to
       ``sys.path`` and import ``flux.job``.
    3. Acknowledge, then receive the URI of the launched Flux instance.
    4. Pull job packages (``_FluxJobInfo``) from ``submission_queue`` and
       submit them to Flux until ``stop_event`` is set and the queue is empty.
    5. Send ``b"shutdown"`` over the socket.

    Raises:
        ScalingFailed: if ``provider.submit`` returns a falsy job id.
    """
    provider.script_dir = working_dir  # type: ignore

    bound_port = socket.bind_to_random_port("tcp://*")
    command = launch_cmd.format(
        port=bound_port,
        protocol="tcp",
        hostname=gethostname(),
        python=sys.executable,
        flux=flux_path,
        manager=_MANAGER_PATH,
    )
    job_id = provider.submit(command, 1)
    if not job_id:
        raise ScalingFailed(
            executor,
            "Attempt to provision nodes via provider has failed",
        )

    # Wait for the flux package path to be sent, then read it from the socket.
    _check_provider_job(socket, provider, job_id)
    flux_pkg_path = socket.recv().decode()  # type: ignore
    # The package can only be made importable by extending sys.path.
    if flux_pkg_path not in sys.path:
        sys.path.append(flux_pkg_path)
    import flux.job

    socket.send(b"ack")  # dummy message
    # Next message is the URI of the Flux instance launched by the provider.
    _check_provider_job(socket, provider, job_id)
    flux_instance_uri = socket.recv()

    # Connect a ``flux.job.FluxExecutor`` to the remote Flux instance.
    with flux.job.FluxExecutor(handle_args=(flux_instance_uri, ),
                               **flux_executor_kwargs) as flux_executor:
        # Keep going until the stop event is set AND the queue is drained, so
        # everything queued before the stop request is still submitted.
        while not (stop_event.is_set() and submission_queue.empty()):
            try:
                jobinfo = submission_queue.get(timeout=0.05)
            except queue.Empty:
                continue
            _submit_single_job(flux_executor, working_dir, jobinfo)
    socket.send(b"shutdown")
Ejemplo n.º 6
0
 def _launch_block(self, block_id: str) -> Any:
     """Submit the launch command for ``block_id`` and return the job id.

     The command comes from ``self._get_launch_command(block_id)`` and is
     passed to ``self.provider.submit`` together with the argument ``1``.

     Raises:
          ScalingFailed: if the provider returns a falsy job id.
     """
     launch_cmd = self._get_launch_command(block_id)
     job_id = self.provider.submit(launch_cmd, 1)
     if not job_id:
         raise ScalingFailed(
             self, "Attempt to provision nodes did not return a job ID")
     logger.debug(
         f"Launched block {block_id} on executor {self.label} with job ID {job_id}"
     )
     return job_id
Ejemplo n.º 7
0
    def start(self):
        """Open the ZMQ pipes, start queue handling and launch initial blocks.

        Creates the outgoing task pipe and incoming result pipe on
        ``127.0.0.1``, marks the executor alive, and starts the queue
        management thread and the local queue process through their helper
        methods.

        With a provider, ``self.launch_cmd`` is replaced by its formatted
        form (task URL, workers per node, log directory), scaling is enabled
        and, if the provider has an ``init_blocks`` attribute, that many
        blocks are submitted and appended to ``self.blocks``. Without a
        provider, scaling is disabled.

        Raises:
             ScalingFailed: if the provider returns a falsy handle for an
                 initial block. Any exception raised while launching the
                 initial blocks is logged and re-raised.
        """
        port_range = self.interchange_port_range
        self.outgoing_q = zmq_pipes.TasksOutgoing("127.0.0.1", port_range)
        self.incoming_q = zmq_pipes.ResultsIncoming("127.0.0.1", port_range)

        self.is_alive = True

        self._queue_management_thread = None
        self._start_queue_management_thread()
        self._start_local_queue_process()

        logger.debug("Created management thread: {}".format(
            self._queue_management_thread))

        if not self.provider:
            self._scaling_enabled = False
            logger.debug("Starting LowLatencyExecutor with no provider")
            return

        # The formatted command overwrites the template from here on.
        self.launch_cmd = self.launch_cmd.format(
            task_url=self.worker_task_url,
            workers_per_node=self.workers_per_node,
            logdir="{}/{}".format(self.run_dir, self.label))
        logger.debug("Launch command: {}".format(self.launch_cmd))

        self._scaling_enabled = True
        logger.debug("Starting LowLatencyExecutor with provider:\n%s",
                     self.provider)

        if not hasattr(self.provider, 'init_blocks'):
            return

        try:
            for i in range(self.provider.init_blocks):
                block = self.provider.submit(self.launch_cmd,
                                             self.workers_per_node)
                logger.debug("Launched block {}:{}".format(i, block))
                if not block:
                    raise ScalingFailed(
                        self.provider.label,
                        "Attempts to provision nodes via provider has failed")
                self.blocks.append(block)
        except Exception as e:
            logger.error("Scaling out failed: {}".format(e))
            raise e
Ejemplo n.º 8
0
    def scale_out(self, blocks=1):
        """Launch ``blocks`` additional blocks through the provider.

        Each new block gets the external id ``str(len(self.blocks))``. The
        launch command template is formatted with that id and submitted via
        ``self.provider.submit``; the returned handle is stored in
        ``self.blocks`` under the external id.

        Returns:
             list of the external block ids (str) launched by this call.

        Raises:
             ScalingFailed: if the provider returns a falsy handle.
        """
        launched_ids = []
        for _ in range(blocks):
            external_block_id = str(len(self.blocks))
            command = self.launch_cmd.format(block_id=external_block_id)
            internal_block = self.provider.submit(command, 1)
            logger.debug("Launched block {}->{}".format(external_block_id, internal_block))
            if not internal_block:
                raise ScalingFailed(
                    self.provider.label,
                    "Attempts to provision nodes via provider has failed")
            launched_ids.append(external_block_id)
            self.blocks[external_block_id] = internal_block
        return launched_ids
Ejemplo n.º 9
0
    def scale_out(self, blocks=1):
        """Submit ``blocks`` new blocks through the provider.

        Every block is submitted with ``self.worker_command`` and the
        argument ``1``; the returned handle is stored in ``self.blocks``
        under the key ``str(len(self.blocks))``. When no provider is
        configured an error is logged and nothing is launched.

        The original note still applies: ideally this would take a resource
        object carrying the scaling methods and be a coroutine, since scaling
        can be slow.

        Raises:
             ScalingFailed: if the provider returns a falsy handle.
        """
        if not self.provider:
            logger.error("No execution provider available to scale")
            return

        for _ in range(blocks):
            external_block = str(len(self.blocks))
            internal_block = self.provider.submit(self.worker_command, 1)
            if not internal_block:
                # The provider could not create the block.
                raise ScalingFailed(
                    self.provider.label,
                    "Attempts to create nodes using the provider has failed")
            self.blocks[external_block] = internal_block
Ejemplo n.º 10
0
    def scale_out(self, blocks=1):
        """Scales out the number of blocks by "blocks"

        Each new block gets the external id ``str(len(self.blocks))`` and is
        launched with ``self._launch_block``. A launch that raises is
        reported through ``self._fail_job_async`` and skipped; it does not
        abort the remaining launches.

        Returns:
             list of the external block ids (str) that were launched
             successfully by this call.

        Raises:
             ScalingFailed: if no provider is configured.
        """
        if not self.provider:
            # Every other ScalingFailed call site passes two arguments
            # (label or executor, reason). With only the reason, building
            # the exception itself would likely fail. No provider label
            # exists here, so pass None.
            raise ScalingFailed(None, "No execution provider available")

        launched = []
        for _ in range(blocks):
            external_block_id = str(len(self.blocks))
            try:
                self.blocks[external_block_id] = self._launch_block(external_block_id)
                launched.append(external_block_id)
            except Exception as ex:
                # Best effort: record the failure and carry on with the rest.
                self._fail_job_async(external_block_id,
                                     "Failed to start block {}: {}".format(external_block_id, ex))
        return launched
Ejemplo n.º 11
0
 def scale_out(self, blocks=1):
     """Launch ``blocks`` new blocks and return the ids that started.

     Each new block gets the id ``str(len(self.blocks))`` and is launched
     with ``self._launch_block``. On success the job id is stored in
     ``self.blocks`` and the reverse mapping in ``self.block_mapping``. A
     launch that raises is reported through ``self._fail_job_async`` and
     skipped.

     Raises:
          ScalingFailed: if no provider is configured.
     """
     if not self.provider:
         raise ScalingFailed(None, "No execution provider available")

     launched = []
     for _ in range(blocks):
         new_id = str(len(self.blocks))
         try:
             handle = self._launch_block(new_id)
             self.blocks[new_id] = handle
             self.block_mapping[handle] = new_id
             launched.append(new_id)
         except Exception as err:
             failure = "Failed to start block {}: {}".format(new_id, err)
             self._fail_job_async(new_id, failure)
     return launched
Ejemplo n.º 12
0
 def scale_out(self, blocks: int = 1) -> List[str]:
     """Launch ``blocks`` new blocks and return the ids that started.

     Block ids come from ``self._block_id_counter.get_id()``. On success the
     job id returned by ``self._launch_block`` is stored in ``self.blocks``
     and the reverse mapping in ``self.block_mapping``. A launch that raises
     is reported through ``self._fail_job_async`` and skipped.

     Raises:
          ScalingFailed: if no provider is configured.
     """
     if not self.provider:
         raise ScalingFailed(self, "No execution provider available")

     started: List[str] = []
     logger.info(f"Scaling out by {blocks} blocks")
     for _ in range(blocks):
         block_id = str(self._block_id_counter.get_id())
         logger.info(f"Allocated block ID {block_id}")
         try:
             job_id = self._launch_block(block_id)
             self.blocks[block_id] = job_id
             self.block_mapping[job_id] = block_id
             started.append(block_id)
         except Exception as ex:
             failure = "Failed to start block {}: {}".format(block_id, ex)
             self._fail_job_async(block_id, failure)
     return started
Ejemplo n.º 13
0
    def scale_out(self, blocks=1):
        """Scales out the number of blocks by "blocks"

        Each block is submitted to the provider with ``self.launch_cmd`` and
        the arguments ``1, 1``; every handle returned is appended to
        ``self.blocks``.

        Returns:
             list of the handles returned by ``provider.submit`` for the
             blocks launched by this call, or None (after logging an error
             once) when no provider is configured.

        Raises:
             ScalingFailed: if the provider returns a falsy handle.
        """
        # The provider does not change between iterations, so check it once
        # instead of logging the same error for every requested block.
        if not self.provider:
            log.error("No execution provider available")
            return None

        launched = []
        for i in range(blocks):
            block = self.provider.submit(self.launch_cmd, 1, 1)
            log.debug(f"Launched block {i}:{block}")
            if not block:
                raise ScalingFailed(
                    self.provider.label,
                    "Attempts to provision nodes via provider has failed",
                )
            self.blocks.append(block)
            # Report what was launched; the result list used to stay empty.
            launched.append(block)
        return launched
Ejemplo n.º 14
0
 def _get_launch_command(self, block_id: str) -> str:
     """Return ``self.launch_cmd`` formatted with ``block_id``.

     Raises:
          ScalingFailed: if ``self.launch_cmd`` is None.
     """
     template = self.launch_cmd
     if template is None:
         raise ScalingFailed(self, "No launch command")
     return template.format(block_id=block_id)