def ee_dask_deploy(config, pb_id, image, n_workers=1, buffers=None, secrets=None):
    """Deploy Dask execution engine.

    Creates a "helm" deployment of the "dask" chart named
    ``proc-<pb_id>-dask`` in the configuration DB, then blocks until a
    Dask client can be connected to the scheduler address derived from
    the deployment ID and the ``SDP_HELM_NAMESPACE`` environment variable.

    :param config: configuration DB handle
    :param pb_id: processing block ID
    :param image: Docker image to deploy
    :param n_workers: number of Dask workers
    :param buffers: list of buffers to mount on Dask workers
        (``None`` or empty means no buffers)
    :param secrets: list of secrets to mount on Dask workers
        (``None`` or empty means no secrets)
    :return: deployment ID and Dask client handle
    :raises KeyError: if ``SDP_HELM_NAMESPACE`` is not set in the environment

    """
    # Make deployment
    deploy_id = "proc-{}-dask".format(pb_id)
    values = {"image": image, "worker.replicas": n_workers}
    # List entries are passed to the chart as indexed keys, e.g. "buffers[0]".
    for i, b in enumerate(buffers or ()):
        values["buffers[{}]".format(i)] = b
    for i, s in enumerate(secrets or ()):
        values["secrets[{}]".format(i)] = s
    deploy = ska_sdp_config.Deployment(
        deploy_id, "helm", {"chart": "dask", "values": values}
    )
    for txn in config.txn():
        txn.create_deployment(deploy)

    # Wait for scheduler to become available
    scheduler = deploy_id + "-scheduler." + os.environ["SDP_HELM_NAMESPACE"] + ":8786"
    client = None
    while client is None:
        try:
            client = distributed.Client(scheduler, timeout=1)
        except Exception:  # pylint: disable=broad-except
            # Connection failures are expected until the scheduler is up, so
            # keep retrying. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit can still stop the wait.
            continue
    return deploy_id, client
def create_deployment(config, pb):
    """Request a DALiuGE Helm deployment for a processing block.

    The deployment ID is the processing block's ``pb_id`` with the suffix
    ``-daliuge``; the deployment type is "helm" and the chart is "daliuge".

    :param config: configuration DB handle providing ``txn()``
    :param pb: processing block; only its ``pb_id`` attribute is read
    :return: the deployment object that was submitted
    """
    logger.info("Deploying DALiuGE...")

    chart_spec = {'chart': 'daliuge'}
    deployment = ska_sdp_config.Deployment(
        pb.pb_id + "-daliuge", "helm", chart_spec
    )

    # Submit the deployment request through the configuration DB.
    for transaction in config.txn():
        transaction.create_deployment(deployment)

    return deployment
def create_deployment(config, pb):
    """Request a DALiuGE Helm deployment for a processing block.

    The deployment ID has the form ``proc-<pb.id>-daliuge``; the deployment
    type is "helm" and the chart is "daliuge".

    :param config: configuration DB handle providing ``txn()``
    :param pb: processing block; only its ``id`` attribute is read
    :return: the deployment object that was submitted
    """
    logger.info("Deploying DALiuGE...")

    name = "proc-{}-daliuge".format(pb.id)
    chart_spec = {"chart": "daliuge"}
    deployment = ska_sdp_config.Deployment(name, "helm", chart_spec)

    # Submit the deployment request through the configuration DB.
    for transaction in config.txn():
        transaction.create_deployment(deployment)

    return deployment
def _start_workflow(txn, pb_id):
    """
    Start the workflow for a processing block.

    Looks up the workflow definition referenced by the processing block.
    If it provides an image, a "workflow" Helm deployment named
    ``proc-<pb_id>-workflow`` is created and the processing block state is
    set to STARTING; otherwise the state is set to FAILED with a reason.

    :param txn: config DB transaction
    :param pb_id: processing block ID

    """
    LOG.info("Making deployment for processing block %s", pb_id)

    # Identify the workflow requested by the processing block.
    pb = txn.get_processing_block(pb_id)
    wf_type = pb.workflow["type"]
    wf_id = pb.workflow["id"]
    wf_version = pb.workflow["version"]
    wf_description = "{} workflow {}, version {}".format(wf_type, wf_id, wf_version)

    # Resolve the container image; a missing definition counts as no image.
    definition = txn.get_workflow(wf_type, wf_id, wf_version)
    wf_image = definition.get("image") if definition is not None else None

    if wf_image is None:
        # Invalid workflow: record the failure instead of deploying.
        state = {"status": "FAILED", "reason": "No image for " + wf_description}
    else:
        LOG.info("Deploying %s", wf_description)
        deploy_id = "proc-{}-workflow".format(pb_id)
        # Environment variables handed to the workflow container.
        env = {
            name: os.environ[name]
            for name in ("SDP_CONFIG_HOST", "SDP_HELM_NAMESPACE")
        }
        values = {"env": env, "wf_image": wf_image, "pb_id": pb_id}
        deploy = ska_sdp_config.Deployment(
            deploy_id, "helm", {"chart": "workflow", "values": values}
        )
        txn.create_deployment(deploy)
        # Deployment requested: resources are not yet marked as available.
        state = {"status": "STARTING", "resources_available": False}

    # Create the processing block state.
    txn.create_processing_block_state(pb_id, state)
def buffer_create(config, name, size=None):
    """Create buffer reservation.

    Submits a "helm" deployment of the "buffer" chart whose deployment ID
    is ``name``.

    :param config: configuration DB handle
    :param name: name, used directly as the deployment ID
    :param size: size, uses default in chart if None

    """
    # Only pass "size" to the chart when the caller specified one.
    values = {} if size is None else {"size": size}
    chart = {"chart": "buffer", "values": values}
    deploy = ska_sdp_config.Deployment(name, "helm", chart)

    for transaction in config.txn():
        transaction.create_deployment(deploy)
def _deploy(self, deploy_name, values=None):
    """
    Deploy the Helm chart.

    Sets ``self._deploy_id`` to ``proc-<pb_id>-<deploy_name>``, marks the
    deployment status as RUNNING and then creates the deployment in the
    configuration DB. The chart name is ``deploy_name`` itself.

    :param deploy_name: deployment name, also used as the chart name
    :param values: optional dict of values

    """
    LOG.info("Deploying Helm chart: %s", deploy_name)

    self._deploy_id = "proc-{}-{}".format(self._pb_id, deploy_name)
    self.update_deploy_status("RUNNING")

    # Include a "values" entry only when values were supplied.
    if values is None:
        chart = {"chart": deploy_name}
    else:
        chart = {"chart": deploy_name, "values": values}

    deployment = ska_sdp_config.Deployment(self._deploy_id, "helm", chart)
    for transaction in self._config.txn():
        transaction.create_deployment(deployment)
def main(argv):
    """Claim a processing block and run a PSS Receive deployment for it.

    The processing block ID is taken from ``argv[0]``. A "helm" deployment
    of the ``pss-receive`` chart named ``<pb_id>-pss-receive`` is created,
    after which the function idles until it no longer owns the processing
    block. The deployment is deleted and the configuration connection
    closed on the way out.

    :param argv: argument list; the first element is the processing block ID
    """
    pb_id = argv[0]

    # Claim the processing block and read it back.
    for txn in config.txn():
        txn.take_processing_block(pb_id, config.client_lease)
        pb = txn.get_processing_block(pb_id)

    log.info("Claimed processing block %s", pb)

    # Request the PSS Receive deployment.
    log.info("Deploying PSS Receive...")
    chart = {'chart': 'pss-receive'}  # Helm chart deploy/charts/pss-receive
    deploy = ska_sdp_config.Deployment(pb.pb_id + "-pss-receive", "helm", chart)
    for txn in config.txn():
        txn.create_deployment(deploy)

    try:
        # Idle for as long as we still own the processing block.
        log.info("Done, now idling...")
        for txn in config.txn():
            if txn.is_processing_block_owner(pb.pb_id):
                txn.loop(True)
            else:
                break
    finally:
        # Clean up the PSS Receive deployment.
        for txn in config.txn():
            txn.delete_deployment(deploy)

        config.close()
def main():
    """Main loop.

    Runs inside a configuration database transaction loop and, on every
    pass:

    * refreshes the workflow definitions once the refresh time has passed;
    * deletes deployments whose processing block (inferred from the
      deployment ID) no longer exists;
    * creates a "workflow" Helm deployment for each processing block that
      has no deployment and requests a known realtime workflow;
    * asks the transaction loop to wait, with a timeout equal to the time
      remaining until the next workflow refresh.
    """
    # Environment variables that are passed on to workflow containers.
    values_env = get_environment_variables(
        ['SDP_CONFIG_HOST', 'SDP_HELM_NAMESPACE'])

    # Initial fetch of the workflow definitions.
    workflows_version, workflows_realtime, workflows_batch = \
        update_workflow_definition(WORKFLOWS_URL, WORKFLOWS_SCHEMA)
    next_workflows_refresh = time.time() + WORKFLOWS_REFRESH

    # Connect to configuration database.
    client = ska_sdp_config.Config()

    LOG.debug("Starting main loop...")
    for txn in client.txn():

        # Refresh the workflow definitions when they are due.
        if time.time() >= next_workflows_refresh:
            LOG.debug('Updating workflow definitions')
            workflows_version, workflows_realtime, workflows_batch = \
                update_workflow_definition(WORKFLOWS_URL, WORKFLOWS_SCHEMA)
            next_workflows_refresh = time.time() + WORKFLOWS_REFRESH

        # Snapshot of processing blocks and deployments.
        current_pbs = txn.list_processing_blocks()
        current_deployments = txn.list_deployments()

        # Processing blocks that already have a deployment, inferred from
        # the deployment IDs and de-duplicated.
        current_pbs_with_deployment = list(
            {get_pb_id_from_deploy_id(dpl) for dpl in current_deployments})

        LOG.debug("Current PBs: {}".format(current_pbs))
        LOG.debug("Current deployments: {}".format(current_deployments))
        LOG.debug("Current PBs with deployment: {}".format(
            current_pbs_with_deployment))

        # Remove deployments whose processing block is gone.
        for deploy_id in current_deployments:
            if get_pb_id_from_deploy_id(deploy_id) in current_pbs:
                continue
            LOG.info("Deleting deployment {}".format(deploy_id))
            orphan = txn.get_deployment(deploy_id)
            txn.delete_deployment(orphan)

        # Deploy workflows for processing blocks that have no deployment.
        for pb_id in current_pbs:
            if pb_id in current_pbs_with_deployment:
                continue

            pb = txn.get_processing_block(pb_id)
            wf_type = pb.workflow['type']
            wf_id = pb.workflow['id']
            wf_version = pb.workflow['version']
            LOG.info(
                "PB {} has no deployment (workflow type = {}, ID = {}, version = {})"
                .format(pb_id, wf_type, wf_id, wf_version))

            if wf_type == "batch":
                LOG.warning("Batch workflows are not supported at present")
                continue

            if wf_type != "realtime":
                LOG.error("Unknown workflow type: {}".format(wf_type))
                continue

            if (wf_id, wf_version) not in workflows_realtime:
                # Unknown realtime workflow ID and version.
                LOG.error("Workflow ID = {} version = {} is not supported"
                          .format(wf_id, wf_version))
                continue

            LOG.info("Deploying realtime workflow ID = {}, version = {}"
                     .format(wf_id, wf_version))
            wf_image = workflows_realtime[(wf_id, wf_version)]
            deploy_id = "{}-workflow".format(pb_id)

            # Chart values: a copy of the environment variable values plus
            # the workflow image and processing block ID.
            values = dict(values_env, wf_image=wf_image, pb_id=pb_id)
            deploy = ska_sdp_config.Deployment(
                deploy_id, 'helm', {'chart': 'workflow', 'values': values})
            LOG.info("Creating deployment {}".format(deploy_id))
            txn.create_deployment(deploy)

        LOG.debug("Waiting...")
        txn.loop(wait=True, timeout=next_workflows_refresh - time.time())
# # This is done by adding the request to the configuration database, # where it will be picked up and executed by appropriate # controllers. In the full system this will involve external checks # for whether the workflow actually has been assigned enough resources # to do this - and for obtaining such assignments the workflow would # need to communicate with a scheduler process. But we are ignoring # all of that at the moment. log.info("Deploying Dask...") deploy_id = pb.pb_id + "-dask" deploy = ska_sdp_config.Deployment( deploy_id, "helm", { 'chart': 'stable/dask', 'values': { 'jupyter.enabled': 'false', 'worker.replicas': 2, # We want to access Dask in-cluster using a DNS name 'scheduler.serviceType': 'ClusterIP' } }) for txn in config.txn(): txn.create_deployment(deploy) try: # Wait for Dask to become available. At some point there will be a # way to learn about availability from the configuration database # (clearly populated by controllers querying Helm/Kubernetes). So # for the moment we'll simply query the DNS name where we know # that Dask must become available eventually log.info("Waiting for Dask...")
def _deploy(self, deploy_name, n_workers, func, f_args):
    """
    Make the deployment and execute the function.

    This is called from the thread.

    Creates a "helm" deployment of the "dask" chart named
    ``proc-<pb_id>-<deploy_name>``, connects a Dask client to its
    scheduler, calls ``func(*f_args)``, computes the result and finally
    marks the deployment status as FINISHED. If no client can be connected
    after 200 attempts the process exits with status 1.

    :param deploy_name: deployment name
    :param n_workers: number of dask workers
    :param func: function to process; its return value must provide
        ``compute()``
    :param f_args: function arguments, unpacked as positional arguments

    """
    LOG.info("Deploying Dask...")
    self._deploy_id = "proc-{}-{}".format(self._pb_id, deploy_name)
    LOG.info(self._deploy_id)

    # Set Deployment to RUNNING status in the config_db
    self.update_deploy_status("RUNNING")

    # Hack for mismatch between formats of dask/distributed package version
    # Getting image from config db through the pb type, id and version
    wf_image = None
    for txn in self._config.txn():
        pb = txn.get_processing_block(self._pb_id)
        wf_image = txn.get_workflow(
            pb.workflow["type"], pb.workflow["id"], pb.workflow["version"]
        )

    # Merge the workflow definition (if any) into the chart values.
    values = {"worker.replicas": n_workers}
    if wf_image is not None:
        values.update(wf_image)

    deploy = ska_sdp_config.Deployment(
        self._deploy_id,
        "helm",
        {"chart": "dask", "values": values},
    )
    for txn in self._config.txn():
        txn.create_deployment(deploy)

    LOG.info("Waiting for Dask...")
    client = None
    for _ in range(200):
        try:
            client = distributed.Client(
                self._deploy_id
                + "-scheduler."
                + os.environ["SDP_HELM_NAMESPACE"]
                + ":8786"
            )
        except Exception as ex:  # pylint: disable=broad-except
            LOG.error(ex)
        else:
            # Connected: stop retrying so that no further clients are
            # opened on the remaining iterations.
            break

    if client is None:
        LOG.error("Could not connect to Dask!")
        sys.exit(1)
    LOG.info("Connected to Dask")

    # Computing result
    result = func(*f_args)
    compute_result = result.compute()
    LOG.info("Computed Result %s", compute_result)

    # Update Deployment Status
    self.update_deploy_status("FINISHED")
def make_deployment(dpl_name, dpl_args, pb_id):
    """Make a deployment given PB parameters.

    The deployment ID is ``<pb_id>-<dpl_name>``; the entries of
    ``dpl_args`` are forwarded as keyword arguments to
    ``ska_sdp_config.Deployment``.

    :param dpl_name: deployment name, used as the ID suffix
    :param dpl_args: mapping of keyword arguments for the deployment
    :param pb_id: processing block ID, used as the ID prefix
    :return: the constructed deployment object
    """
    deploy_id = "-".join((pb_id, dpl_name))
    return ska_sdp_config.Deployment(deploy_id, **dpl_args)
def main(argv):
    """Claim a processing block, deploy Dask for it and run a test job.

    The processing block ID is taken from ``argv[0]``. A "helm" deployment
    of the ``stable/dask`` chart named ``<pb_id>-dask`` is requested, a Dask
    client is connected to its scheduler, a small example computation is
    run, and the function then idles until it no longer owns the processing
    block. The deployment is deleted and the configuration connection
    closed on the way out.

    :param argv: argument list; the first element is the processing block ID
    :raises SystemExit: with status 1 if no Dask client could be connected
        after 200 attempts
    """
    pb_id = argv[0]

    # Note that this process "claims" the workflow with a lease. This
    # means that once a processing block has been claimed, this script
    # must check in with the configuration database every ~10 seconds
    # or will be declared dead (and presumably restarted). This
    # obviously means that no serious work should actually happen here.
    for txn in config.txn():
        txn.take_processing_block(pb_id, config.client_lease)
        pb = txn.get_processing_block(pb_id)

    # Show
    log.info("Claimed processing block %s", pb)

    # Deploy Dask with 2 workers.
    # This is done by adding the request to the configuration database,
    # where it will be picked up and executed by appropriate
    # controllers. In the full system this will involve external checks
    # for whether the workflow actually has been assigned enough resources
    # to do this - and for obtaining such assignments the workflow would
    # need to communicate with a scheduler process. But we are ignoring
    # all of that at the moment.
    log.info("Deploying Dask...")
    deploy_id = pb.pb_id + "-dask"
    deploy = ska_sdp_config.Deployment(
        deploy_id, "helm", {
            'chart': 'stable/dask',
            'values': {
                'jupyter.enabled': 'false',
                'worker.replicas': 2,
                # We want to access Dask in-cluster using a DNS name
                'scheduler.serviceType': 'ClusterIP'
            }
        })
    for txn in config.txn():
        txn.create_deployment(deploy)

    try:

        # Wait for Dask to become available. At some point there will be a
        # way to learn about availability from the configuration database
        # (clearly populated by controllers querying Helm/Kubernetes). So
        # for the moment we'll simply query the DNS name where we know
        # that Dask must become available eventually
        log.info("Waiting for Dask...")
        client = None
        for _ in range(200):
            try:
                client = distributed.Client(
                    deploy_id + '-scheduler.' +
                    os.environ['SDP_HELM_NAMESPACE'] + ':8786')
            except Exception as e:  # pylint: disable=broad-except
                log.warning("Dask not available yet: %s", e)
            else:
                # Connected: stop retrying so that no further clients are
                # opened on the remaining iterations.
                break
        if client is None:
            log.error("Could not connect to Dask!")
            # Raise SystemExit directly; the exit() builtin is only
            # installed by the site module and is meant for interactive use.
            raise SystemExit(1)
        log.info("Connected to Dask")

        # Now we can use Dask to do some calculations. Let's use a silly
        # example from the documentation.
        def inc(x):
            return x + 1

        futures = client.map(inc, range(1000))
        log.info("Dask results: {}".format(client.gather(futures)))

        # Just idle until we lose ownership of the processing block
        log.info("Done, now idling...")
        for txn in config.txn():
            if not txn.is_processing_block_owner(pb.pb_id):
                break
            txn.loop(True)

    finally:

        # Clean up Dask deployment. This should also become semi-optional
        # eventually, as clearly the processing controller should learn to
        # free all deployments associated with a workflow if it terminates
        # for whatever reason.
        for txn in config.txn():
            txn.delete_deployment(deploy)

        config.close()
log.info("Waiting for processing block...") for txn in config.txn(): pb = txn.take_processing_block_by_workflow( workflow, config.client_lease) if pb is not None: continue txn.loop(wait=True) # Show log.info("Claimed processing block %s", pb) # Deploy Vis Receive with 1 worker. log.info("Deploying Vis Receive...") deploy_id = pb.pb_id + "-vis-receive" deploy = ska_sdp_config.Deployment( deploy_id, "helm", { 'chart': 'vis-receive', # Helm chart deploy/charts/vis-receive }) for txn in config.txn(): txn.create_deployment(deploy) try: # Just idle until processing block or we lose ownership log.info("Done, now idling...") for txn in config.txn(): if not txn.is_processing_block_owner(pb.pb_id): break txn.loop(True) finally: # Clean up vis receive deployment.