def start(self):
    """Run the parent chatflow start step, then resolve the network view.

    When a "name" kwarg is supplied, that network is looked up directly and
    the chatflow aborts if it does not exist; otherwise the user is asked to
    pick a network interactively.
    """
    super().start()
    requested_name = self.kwargs.get("name")
    if requested_name:
        self.network_view = deployer.get_network_view(self.kwargs["name"])
        if not self.network_view:
            raise StopChatFlow(f"no network named {self.kwargs['name']}")
    else:
        self.network_view = deployer.select_network(self)
def start_vmachine_deployment(
    self,
    farm_name,
    solution_name,
    query,
    vm_size,
    ssh_keys,
    enable_public_ip=False,
    solution_uuid=None,
    vmachine_type=None,
    duration=None,
):
    """Deploy a virtual machine on `farm_name`.

    Searches for a pool in the same farm and extends it, or creates a new one
    with the required capacity, then deploys the vmachine on a node that is
    not already used by this VDC.

    Args:
        farm_name: farm to deploy on.
        solution_name: name of the vmachine solution.
        query: capacity query dict (cru/mru/sru/...) for capacity check and scheduling.
        vm_size: size key used when preparing the extension pool.
        ssh_keys: public ssh keys to authorize on the machine.
        enable_public_ip: when True, also schedule for / reserve a public IPv4.
        solution_uuid: optional uuid grouping the workloads.
        vmachine_type: optional machine type forwarded to `deploy_vmachine`.
        duration: pool duration in seconds; defaults to the remaining VDC lifetime.

    Returns:
        The result of `self.deploy_vmachine`.

    Raises:
        j.exceptions.Validation: not enough farm capacity, or non-positive duration.
        j.exceptions.Runtime: the vmachine deployment failed.
    """
    # Never reuse nodes that already host this VDC's kubernetes nodes or vmachines.
    old_node_ids = [k8s_node.node_id for k8s_node in self.vdc_instance.kubernetes]
    old_node_ids.extend(vmachine.node_id for vmachine in self.vdc_instance.vmachines)

    cc = CapacityChecker(farm_name)
    cc.exclude_nodes(*old_node_ids)
    if not cc.add_query(**query):
        raise j.exceptions.Validation(f"Not enough capacity in farm {farm_name} for deploying vmachine")

    # Default the pool duration to the time left until the VDC expires (seconds).
    # NOTE(review): `.timestamp` without a call is assumed to be a property of the
    # project's time object — it is used the same way elsewhere in this file; confirm.
    duration = (
        duration if duration else self.vdc_instance.expiration_date.timestamp() - j.data.time.utcnow().timestamp
    )
    if duration <= 0:
        raise j.exceptions.Validation(f"invalid duration {duration}")

    scheduler = Scheduler(farm_name=farm_name)
    scheduler.exclude_nodes(*old_node_ids)
    nodes_generator = scheduler.nodes_by_capacity(**query, public_ip=enable_public_ip)
    pool_id = self._preprare_extension_pool(farm_name, vm_size, duration, enable_public_ip)
    network_view = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
    vm_res = self.deploy_vmachine(
        solution_name,
        vm_size,
        pool_id,
        nodes_generator,
        ssh_keys,
        solution_uuid,
        network_view,
        enable_public_ip,
        vmachine_type,
        description=self.vdc_deployer.description,
    )
    if not vm_res:
        # Message had no placeholders; the redundant f-prefix was dropped and the
        # duplicated literal hoisted into one variable.
        error_message = "Failed to deploy vmachine"
        self.vdc_deployer.error(error_message)
        raise j.exceptions.Runtime(error_message)
    return vm_res
def redeploy_master(self, old_master_workload=None):
    """Redeploy the kubernetes controller (master) of this VDC.

    Looks up the latest master workload when none is given, decommissions it
    if it is still scheduled to deploy, then deploys a fresh master on the
    same pool, wired to the VDC's existing etcd endpoints.

    Args:
        old_master_workload: the previous master workload; when None it is
            resolved via `self._get_latest_master_workload()`.

    Returns:
        True on success; None when no previous master workload can be found.
    """
    nv = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
    if not old_master_workload:
        old_master_workload = self._get_latest_master_workload()
        if not old_master_workload:
            self.vdc_deployer.error("Couldn't find old master workload")
            return
    # Delete the old master first when its next action is still DEPLOY,
    # and block until the deletion is processed.
    if old_master_workload.info.next_action == NextAction.DEPLOY:
        self.vdc_deployer.info(f"Deleting old master workload {old_master_workload.id}")
        self.zos.workloads.decomission(old_master_workload.id)
        deployer.wait_workload_deletion(old_master_workload.id)
    # Old-network cleanup kept disabled (left as-is for reference):
    # old_network_workload = None
    # workloads = self.zos.workloads.list_workloads(self.vdc_instance.identity_tid)
    # for workload in workloads:
    #     if workload.info.workload_type == WorkloadType.Network_resource and self._ip_in_network(
    #         old_master_workload.ipaddress, workload.iprange
    #     ):
    #         old_network_workload = workload
    #         break
    # self.vdc_deployer.info(f"Deleting old network on node {old_network_workload.info.node_id}")
    # nv.delete_node(old_network_workload.info.node_id)
    master_size = VDC_SIZE.VDC_FLAVORS[self.vdc_deployer.flavor]["k8s"]["controller_size"]
    pub_keys = [self.vdc_deployer.ssh_key.public_key.strip()]
    gs = GlobalScheduler()
    # Refresh VDC info so the etcd list below reflects the currently used nodes.
    self.vdc_instance.load_info()
    endpoints = ",".join([f"http://{etcd.ip_address}:2379" for etcd in self.vdc_instance.etcd])
    self.vdc_deployer.info("Deploying new master")
    # Reuse the public ip workload that was attached to the old master.
    public_ip_workload = self.zos.workloads.get(old_master_workload.public_ip)
    self.deploy_master(
        old_master_workload.info.pool_id,
        gs,
        master_size,
        self.vdc_instance.get_password(),
        pub_keys,
        self.vdc_uuid,
        nv,
        endpoints,
        public_ip=public_ip_workload.ipaddress,
    )
    return True
def reservation(self):
    """Reserve all workloads needed to expose the solution over a domain.

    Schedules a container node, joins it to the solution network, reserves a
    subdomain (for managed domains) and a reverse proxy on the gateway, then
    exposes the solution address through a TCP router container.

    Raises:
        DeploymentFailed: when any reserved workload fails to deploy.
        StopChatFlow: when no free ip is available on the selected node.
    """
    metadata = {
        "name": self.domain,
        "form_info": {"Solution name": self.domain, "chatflow": "exposed"},
    }
    self.solution_metadata.update(metadata)
    # Minimal capacity needed for the TCP router container.
    query = {"mru": 1, "cru": 1, "sru": 1}
    self.selected_node = deployer.schedule_container(self.pool_id, **query)
    self.network_name = self.solution["Network"]
    # Make sure the selected node is part of the solution network.
    result = deployer.add_network_node(
        self.network_name, self.selected_node, self.pool_id, bot=self, owner=self.solution_metadata.get("owner")
    )
    if result:
        for wid in result["ids"]:
            success = deployer.wait_workload(wid, self, breaking_node_id=self.selected_node.node_id)
            if not success:
                raise DeploymentFailed(f"Failed to add node to network {wid}", wid=wid)
    self.network_view = deployer.get_network_view(self.network_name)
    self.tcprouter_ip = self.network_view.get_free_ip(self.selected_node)
    if not self.tcprouter_ip:
        raise StopChatFlow(
            f"No available ips one for network {self.network_view.name} node {self.selected_node.node_id}"
        )
    if self.domain_type != "Custom Domain":
        # Managed domain: reserve the subdomain on the gateway.
        self.dom_id = deployer.create_subdomain(
            pool_id=self.domain_pool.pool_id,
            gateway_id=self.domain_gateway.node_id,
            subdomain=self.domain,
            **self.solution_metadata,
            solution_uuid=self.solution_id,
        )
        success = deployer.wait_workload(self.dom_id, self)
        if not success:
            raise DeploymentFailed(
                f"Failed to reserve sub-domain workload {self.dom_id}", solution_uuid=self.solution_id
            )
    # Reverse proxy on the gateway, pointing the domain at the TCP router.
    self.proxy_id = deployer.create_proxy(
        pool_id=self.domain_pool.pool_id,
        gateway_id=self.domain_gateway.node_id,
        domain_name=self.domain,
        trc_secret=self.secret,
        **self.solution_metadata,
        solution_uuid=self.solution_id,
    )
    success = deployer.wait_workload(self.proxy_id, self)
    if not success:
        raise DeploymentFailed(
            f"Failed to reserve reverse proxy workload {self.proxy_id}", solution_uuid=self.solution_id
        )
    # TCP router container forwarding gateway traffic to the solution ip/ports.
    # NOTE(review): the other `reservation` variant in this file unpacks a tuple
    # from `deployer.expose_address` (`self.tcprouter_id, _ = ...`) — confirm
    # which return shape this deployer version uses.
    self.tcprouter_id = deployer.expose_address(
        pool_id=self.pool_id,
        gateway_id=self.domain_gateway.node_id,
        network_name=self.network_name,
        local_ip=self.solution_ip,
        port=self.port,
        tls_port=self.tls_port,
        trc_secret=self.secret,
        bot=self,
        **self.solution_metadata,
        solution_uuid=self.solution_id,
    )
    success = deployer.wait_workload(self.tcprouter_id, self)
    if not success:
        raise DeploymentFailed(
            f"Failed to reserve TCP Router container workload {self.tcprouter_id}",
            solution_uuid=self.solution_id,
            wid=self.tcprouter_id,
        )
def reservation(self):
    """Reserve all workloads needed to expose the solution over a domain.

    Schedules a container node, joins it to the solution network, reserves a
    subdomain (for managed domains), optionally a TRC reverse proxy, and then
    exposes the solution — via nginx with a certificate when `proxy_type` is
    "NGINX", otherwise via a TCP router container.

    Raises:
        DeploymentFailed: when any reserved workload fails to deploy.
        StopChatFlow: when no free ip is available on the selected node.
    """
    metadata = {
        "name": self.domain,
        "form_info": {"Solution name": self.domain, "chatflow": "exposed"},
    }
    self.solution_metadata.update(metadata)
    # Minimal capacity needed for the exposing container.
    query = {"mru": 1, "cru": 1, "sru": 1}
    self.selected_node = deployer.schedule_container(self.pool_id, **query)
    self.network_name = self.solution["Network"]
    # Make sure the selected node is part of the solution network.
    result = deployer.add_network_node(
        self.network_name, self.selected_node, self.pool_id, bot=self, owner=self.solution_metadata.get("owner")
    )
    if result:
        for wid in result["ids"]:
            success = deployer.wait_workload(wid, self, breaking_node_id=self.selected_node.node_id)
            if not success:
                raise DeploymentFailed(f"Failed to add node to network {wid}", wid=wid)
    self.network_view = deployer.get_network_view(self.network_name)
    self.tcprouter_ip = self.network_view.get_free_ip(self.selected_node)
    if not self.tcprouter_ip:
        raise StopChatFlow(
            f"No available ips one for network {self.network_view.name} node {self.selected_node.node_id}"
        )
    if self.domain_type != "Custom Domain":
        # Managed domain: reserve the subdomain on the gateway.
        self.dom_id = deployer.create_subdomain(
            pool_id=self.domain_pool.pool_id,
            gateway_id=self.domain_gateway.node_id,
            subdomain=self.domain,
            **self.solution_metadata,
            solution_uuid=self.solution_id,
        )
        success = deployer.wait_workload(self.dom_id, self)
        if not success:
            raise DeploymentFailed(
                f"Failed to reserve sub-domain workload {self.dom_id}", solution_uuid=self.solution_id
            )
    if self.proxy_type == "TRC":
        # TRC proxying needs an explicit reverse proxy on the gateway.
        self.proxy_id = deployer.create_proxy(
            pool_id=self.domain_pool.pool_id,
            gateway_id=self.domain_gateway.node_id,
            domain_name=self.domain,
            trc_secret=self.secret,
            **self.solution_metadata,
            solution_uuid=self.solution_id,
        )
        success = deployer.wait_workload(self.proxy_id, self)
        if not success:
            raise DeploymentFailed(
                f"Failed to reserve reverse proxy workload {self.proxy_id}", solution_uuid=self.solution_id
            )
    # Optional log sink config; used by both exposing branches below.
    trc_log_config = j.core.config.get("LOGGING_SINK", {})
    if trc_log_config:
        trc_log_config["channel_name"] = f"{self.threebot_name}-{self.solution_name}-trc".lower()
    if self.proxy_type == "NGINX":
        # Nginx with an auto-created certificate, optionally forcing https.
        self.tcprouter_id = deployer.expose_and_create_certificate(
            domain=self.domain,
            email=self.email,
            pool_id=self.pool_id,
            gateway_id=self.domain_gateway.node_id,
            network_name=self.network_name,
            solution_ip=self.solution_ip,
            solution_port=self.port,
            trc_secret=self.secret,
            bot=self,
            enforce_https=self.force_https,
            log_config=trc_log_config,
            **self.solution_metadata,
            solution_uuid=self.solution_id,
        )
    else:
        # TCP router container forwarding gateway traffic to the solution.
        self.tcprouter_id, _ = deployer.expose_address(
            pool_id=self.pool_id,
            gateway_id=self.domain_gateway.node_id,
            network_name=self.network_name,
            local_ip=self.solution_ip,
            port=self.port,
            tls_port=self.tls_port,
            trc_secret=self.secret,
            bot=self,
            log_config=trc_log_config,
            **self.solution_metadata,
            solution_uuid=self.solution_id,
        )
    success = deployer.wait_workload(self.tcprouter_id, self)
    if not success:
        raise DeploymentFailed(
            f"Failed to reserve TCP Router container workload {self.tcprouter_id}",
            solution_uuid=self.solution_id,
            wid=self.tcprouter_id,
        )
def deploy_s3_minio_container(self, pool_id, ak, sk, ssh_key, scheduler, zdb_wids, solution_uuid, password):
    """Deploy a minio container backed by the given zdb namespaces.

    Iterates candidate nodes with enough capacity (IPv6) and tries each one
    until the minio container deploys successfully.

    Args:
        pool_id: capacity pool to deploy in.
        ak: minio access key.
        sk: minio secret key.
        ssh_key: public ssh key to authorize in the container.
        scheduler: node scheduler used to pick candidate nodes.
        zdb_wids: workload ids of the backing zdb namespaces.
        solution_uuid: uuid grouping the workloads.
        password: password used to resolve the zdb namespace urls.

    Returns:
        The minio container workload id, or None when every candidate fails.
    """
    self.vdc_deployer.info(f"deploying minio for zdbs: {zdb_wids}")
    # Resolve each zdb namespace into a connection url for minio's shards
    # (comprehension replaces the original append loop; same call order).
    zdb_configs = [
        deployer.get_zdb_url(zid, password, identity_name=self.identity.instance_name) for zid in zdb_wids
    ]
    self.vdc_deployer.info(f"zdb_configs: {zdb_configs}")
    network_view = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
    for node in scheduler.nodes_by_capacity(
        cru=MINIO_CPU, mru=MINIO_MEMORY / 1024, sru=MINIO_DISK / 1024, ip_version="IPv6"
    ):
        self.vdc_deployer.info(f"node {node.node_id} selected for minio")
        # Attach the candidate node to the VDC network before deploying on it.
        try:
            result = deployer.add_network_node(
                self.vdc_name, node, pool_id, network_view, self.bot, self.identity.instance_name
            )
            if result:
                for wid in result["ids"]:
                    success = deployer.wait_workload(
                        wid, self.bot, 5, identity_name=self.identity.instance_name, cancel_by_uuid=False
                    )
                    if not success:
                        self.vdc_deployer.error(f"workload {wid} failed when adding node to network")
                        raise DeploymentFailed()
        except DeploymentFailed:
            self.vdc_deployer.error(f"failed to deploy minio network on node {node.node_id}.")
            continue
        network_view = network_view.copy()
        ip_address = network_view.get_free_ip(node)
        self.vdc_deployer.info(f"minio ip address {ip_address}")
        try:
            result = deployer.deploy_minio_containers(
                pool_id,
                self.vdc_name,
                [node.node_id],
                [ip_address],
                zdb_configs,
                ak,
                sk,
                ssh_key,
                MINIO_CPU,
                MINIO_MEMORY,
                S3_NO_DATA_NODES,
                S3_NO_PARITY_NODES,
                public_ipv6=True,
                disk_size=int(MINIO_DISK / 1024),
                bot=self.bot,
                identity_name=self.identity.instance_name,
                # form_info={"chatflow": "minio"},
                # name=self.vdc_name,
                solution_uuid=solution_uuid,
                description=self.vdc_deployer.description,
            )
        except DeploymentFailed as e:
            if e.wid:
                workload = self.zos.workloads.get(e.wid)
                self.vdc_deployer.error(
                    f"failed to deploy minio volume wid: {e.wid} on node {workload.info.node_id}"
                )
            else:
                self.vdc_deployer.error(f"failed to deploy minio volume due to error {str(e)}")
            continue
        wid = result[0]
        try:
            success = deployer.wait_workload(
                wid, self.bot, identity_name=self.identity.instance_name, cancel_by_uuid=False
            )
            if not success:
                raise DeploymentFailed()
            self.vdc_deployer.info(f"minio container deployed successfully wid: {wid}")
            return wid
        except DeploymentFailed:
            self.vdc_deployer.error(f"failed to deploy minio container wid: {wid}")
            continue
    # Exhausted every candidate node.
    self.vdc_deployer.error("no nodes available to deploy minio container")
def deploy_external_etcd(self, farm_name, no_nodes=ETCD_CLUSTER_SIZE, solution_uuid=None):
    """Deploy an external etcd cluster of `no_nodes` containers on `farm_name`.

    Retries with the next batch of capable nodes until a whole cluster
    deploys; on partial failure all workloads of the attempt are
    decommissioned before retrying.

    Args:
        farm_name: farm whose pool should host the cluster.
        no_nodes: cluster size (defaults to ETCD_CLUSTER_SIZE).
        solution_uuid: uuid grouping the workloads; generated when omitted.

    Returns:
        list: container ip addresses of the cluster, or None when no nodes
        are available.
    """
    network_view = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
    pool_id, _ = self.vdc_deployer.get_pool_id_and_reservation_id(farm_name)
    scheduler = Scheduler(pool_id=pool_id)
    nodes_generator = scheduler.nodes_by_capacity(cru=ETCD_CPU, sru=ETCD_DISK / 1024, mru=ETCD_MEMORY / 1024)
    solution_uuid = solution_uuid or uuid.uuid4().hex
    while True:
        deployment_nodes = self._add_nodes_to_network(pool_id, nodes_generator, [], no_nodes, network_view)
        if not deployment_nodes:
            self.vdc_deployer.error("no available nodes to deploy etcd cluster")
            return
        self.vdc_deployer.info(f"deploying etcd cluster on nodes {[node.node_id for node in deployment_nodes]}")
        network_view = network_view.copy()
        ip_addresses = []
        node_ids = []
        # Build the initial-cluster string ("etcd_1=http://ip:2380,...");
        # every entry ends with a comma, preserving the original trailing comma.
        cluster_entries = []
        for member_no, node in enumerate(deployment_nodes, start=1):
            address = network_view.get_free_ip(node)
            ip_addresses.append(address)
            cluster_entries.append(f"etcd_{member_no}=http://{address}:2380,")
            node_ids.append(node.node_id)
        etcd_cluster = "".join(cluster_entries)
        secret_env = None
        # Disabled restic backup wiring, kept for reference:
        # etcd_backup_config = j.core.config.get("VDC_S3_CONFIG", {})
        # restic_url = etcd_backup_config.get("S3_URL", "")
        # restic_bucket = etcd_backup_config.get("S3_BUCKET", "")
        # restic_ak = etcd_backup_config.get("S3_AK", "")
        # restic_sk = etcd_backup_config.get("S3_SK", "")
        # if all([self.vdc_deployer.restore, restic_url, restic_bucket, restic_ak, restic_sk]):
        #     secret_env = {
        #         "RESTIC_REPOSITORY": f"s3:{restic_url}/{restic_bucket}/{self.vdc_instance.owner_tname}/{self.vdc_instance.vdc_name}",
        #         "AWS_ACCESS_KEY_ID": restic_ak,
        #         "AWS_SECRET_ACCESS_KEY": restic_sk,
        #         "RESTIC_PASSWORD": self.vdc_deployer.password_hash,
        #     }
        # Log channel suffix derived from the explorer url (dead pre-assignment
        # `explorer = None` removed: every branch assigns it).
        if "test" in j.core.identity.me.explorer_url:
            explorer = "test"
        elif "dev" in j.core.identity.me.explorer_url:
            explorer = "dev"
        else:
            explorer = "main"
        log_config = j.core.config.get("VDC_LOG_CONFIG", {})
        if log_config:
            log_config["channel_name"] = f"{self.vdc_instance.instance_name}_{explorer}"
        pool_ids = [pool_id] * no_nodes
        wids = deployer.deploy_etcd_containers(
            pool_ids,
            node_ids,
            network_view.name,
            ip_addresses,
            etcd_cluster,
            ETCD_FLIST,
            ETCD_CPU,
            ETCD_MEMORY,
            ETCD_DISK,
            entrypoint="",
            ssh_key=self.vdc_deployer.ssh_key.public_key.strip(),
            identity_name=self.identity.instance_name,
            solution_uuid=solution_uuid,
            description=self.vdc_deployer.description,
            secret_env=secret_env,
            log_config=log_config,
        )
        try:
            for wid in wids:
                success = deployer.wait_workload(
                    wid, self.bot, identity_name=self.identity.instance_name, cancel_by_uuid=False
                )
                if not success:
                    self.vdc_deployer.error(f"etcd cluster workload: {wid} failed to deploy")
                    raise DeploymentFailed()
        except DeploymentFailed:
            # Roll back the whole attempt and try the next node batch.
            for wid in wids:
                self.zos.workloads.decomission(wid)
            continue
        return ip_addresses
def extend_cluster(
    self,
    farm_name,
    master_ip,
    k8s_flavor,
    cluster_secret,
    ssh_keys,
    no_nodes=1,
    duration=None,
    public_ip=False,
    solution_uuid=None,
    external=True,
    nodes_ids=None,
    no_extend_pool=False,
):
    """Extend the kubernetes cluster with `no_nodes` workers of `k8s_flavor`.

    Searches for a pool in the same farm and extends it, or creates a new one
    with the required capacity, then deploys the workers.

    Args:
        farm_name: farm to deploy on.
        master_ip: ip of the cluster master the workers join.
        k8s_flavor: key into VDC_SIZE.K8S_SIZES for per-node capacity.
        cluster_secret: kubernetes cluster join secret.
        ssh_keys: public ssh keys to authorize on the workers.
        no_nodes: number of workers to add.
        duration: pool duration in DAYS (converted to seconds below); defaults
            to the remaining VDC lifetime.
        public_ip: reserve a public ipv4 per worker when True.
        solution_uuid: uuid grouping the workloads; generated when omitted.
        external: forwarded to `_add_workers`.
        nodes_ids: optional explicit node ids the workers must land on.
        no_extend_pool: skip extending an existing pool when True.

    Returns:
        list: workload ids of the deployed workers.

    Raises:
        j.exceptions.Validation: capacity/duration/node-selection problems.
        j.exceptions.Runtime: when worker deployment fails (after cancelling
            the partially-deployed solution).
    """
    # Nodes already used by the cluster must not be selected again.
    old_node_ids = []
    for k8s_node in self.vdc_instance.kubernetes:
        old_node_ids.append(k8s_node.node_id)
    cc = CapacityChecker(farm_name)
    cc.exclude_nodes(*old_node_ids)
    # Verify the farm can host all requested workers before reserving anything.
    for _ in range(no_nodes):
        if not cc.add_query(**VDC_SIZE.K8S_SIZES[k8s_flavor]):
            raise j.exceptions.Validation(
                f"Not enough capacity in farm {farm_name} for {no_nodes} kubernetes nodes of flavor {k8s_flavor}"
            )
    # Days -> seconds when given; otherwise the time left until the VDC expires.
    # NOTE(review): `.timestamp` without parentheses is assumed to be a property
    # of the project's time object — it is used the same way elsewhere in this file.
    duration = (
        duration * 60 * 60 * 24
        if duration
        else self.vdc_instance.expiration_date.timestamp() - j.data.time.utcnow().timestamp
    )
    if duration <= 0:
        raise j.exceptions.Validation(f"invalid duration {duration}")
    scheduler = Scheduler(farm_name=farm_name)
    scheduler.exclude_nodes(*old_node_ids)
    nodes_generator = scheduler.nodes_by_capacity(**VDC_SIZE.K8S_SIZES[k8s_flavor], public_ip=public_ip)
    if nodes_ids:
        # Restrict scheduling to the explicitly requested nodes, failing early
        # if any of them is unavailable or lacks capacity.
        nodes_generator = list(nodes_generator)
        nodes_generator_ids = [node.node_id for node in nodes_generator]
        unavailable_nodes_ids = set(nodes_ids) - set(nodes_generator_ids)
        if unavailable_nodes_ids:
            raise j.exceptions.Validation(
                f"Some nodes: {unavailable_nodes_ids} are not in farm: {farm_name} or don't have capacity"
            )
        nodes_generator = [node for node in nodes_generator if node.node_id in nodes_ids]
    pool_id = self._preprare_extension_pool(
        farm_name, k8s_flavor, no_nodes, duration, public_ip, no_extend=no_extend_pool
    )
    network_view = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
    solution_uuid = solution_uuid or uuid.uuid4().hex
    wids = self._add_workers(
        pool_id,
        nodes_generator,
        k8s_flavor,
        cluster_secret,
        ssh_keys,
        solution_uuid,  # a separate uuid so this extension can be cancelled as a unit (see below)
        network_view,
        master_ip,
        no_nodes,
        public_ip,
        external,
    )
    if not wids:
        self.vdc_deployer.error(
            f"Failed to extend kubernetes cluster with {no_nodes} nodes of flavor {k8s_flavor}, vdc uuid {self.vdc_uuid}"
        )
        j.sals.reservation_chatflow.solutions.cancel_solution_by_uuid(solution_uuid)
        raise j.exceptions.Runtime(
            f"failed to extend kubernetes cluster with {no_nodes} nodes of flavor {k8s_flavor}, vdc uuid {self.vdc_uuid}"
        )
    return wids
def deploy_threebot(self, minio_wid, pool_id, kube_config, embed_trc=True, backup_config=None, zdb_farms=None, cert=None):
    """Deploy the VDC's 3bot container.

    Assembles the container's env/secret_env (wallets, kube config, backup and
    S3 settings), optionally prepares a TRC proxy, then tries nodes from the
    pool until the container deploys.

    Args:
        minio_wid: minio workload id (currently only referenced by the
            disabled code below).
        pool_id: capacity pool to deploy in.
        kube_config: kubeconfig content passed to the container secrets.
        embed_trc: when True, use the TRC-embedding flist and wire REMOTE_IP/
            REMOTE_PORT/TRC_SECRET from `_prepare_proxy`.
        backup_config: optional backup configuration dict.
        zdb_farms: optional farms overriding S3_AUTO_TOPUP_FARMS.
        cert: optional certificate object (cert/private_key/fullchain).

    Returns:
        The container workload id, or None when proxy preparation fails or no
        node succeeds.
    """
    backup_config = backup_config or {}
    etcd_backup_config = j.core.config.get("VDC_S3_CONFIG", {})
    flist = THREEBOT_VDC_FLIST if embed_trc else THREEBOT_FLIST
    # Disabled minio address wiring, kept for reference:
    # workload = self.zos.workloads.get(minio_wid)
    # if workload.info.workload_type != WorkloadType.Container:
    #     raise j.exceptions.Validation(f"workload {minio_wid} is not container workload")
    # minio_ip_address = workload.network_connection[0].ipaddress
    # Strip bulky sub-sections before embedding the VDC instance in the secrets.
    vdc_dict = self.vdc_instance.to_dict()
    vdc_dict.pop("s3", None)
    vdc_dict.pop("kubernetes", None)
    vdc_dict.pop("threebot", None)
    secret_env = {
        "BACKUP_CONFIG": j.data.serializers.json.dumps(backup_config),
        "VDC_OWNER_TNAME": self.vdc_deployer.tname,
        "VDC_EMAIL": self.vdc_deployer.email,
        "VDC_PASSWORD_HASH": self.vdc_deployer.vdc_instance.get_password(),
        "KUBE_CONFIG": kube_config,
        "PROVISIONING_WALLET_SECRET": self.vdc_deployer.vdc_instance.provision_wallet.secret,
        "PREPAID_WALLET_SECRET": self.vdc_deployer.vdc_instance.prepaid_wallet.secret,
        "VDC_INSTANCE": j.data.serializers.json.dumps(vdc_dict),
        "THREEBOT_PRIVATE_KEY": self.vdc_deployer.ssh_key.private_key.strip(),
        "S3_URL": etcd_backup_config.get("S3_URL", ""),
        "S3_BUCKET": etcd_backup_config.get("S3_BUCKET", ""),
        "S3_AK": etcd_backup_config.get("S3_AK", ""),
        "S3_SK": etcd_backup_config.get("S3_SK", ""),
    }
    if cert:
        secret_env["CERT"] = cert.cert
        secret_env["CERT_PRIVATE_KEY"] = cert.private_key
        secret_env["CERT_FULLCHAIN"] = cert.fullchain
    env = {
        "VDC_NAME": self.vdc_name,
        "MONITORING_SERVER_URL": j.config.get("MONITORING_SERVER_URL", ""),
        "VDC_UUID": self.vdc_uuid,
        "EXPLORER_URL": j.core.identity.me.explorer_url,
        # Max storage = zdb size scaled up by the parity overhead ratio.
        "VDC_S3_MAX_STORAGE": str(
            int(
                VDC_SIZE.S3_ZDB_SIZES[VDC_SIZE.VDC_FLAVORS[self.vdc_deployer.flavor]["s3"]["size"]]["sru"]
                * (1 + (S3_NO_PARITY_NODES / (S3_NO_DATA_NODES + S3_NO_PARITY_NODES)))
            )
        ),
        "S3_AUTO_TOPUP_FARMS": ",".join(S3_AUTO_TOPUP_FARMS.get()) if not zdb_farms else ",".join(zdb_farms),
        "NETWORK_FARMS": ",".join(NETWORK_FARMS.get()),
        "COMPUTE_FARMS": ",".join(COMPUTE_FARMS.get()),
        # "VDC_MINIO_ADDRESS": minio_ip_address,
        "SDK_VERSION": self.branch,
        "SSHKEY": self.vdc_deployer.ssh_key.public_key.strip(),
        "MINIMAL": "true",
        "TEST_CERT": "true" if j.core.config.get("TEST_CERT") else "false",
        "ACME_SERVER_URL": self.acme_server_url,
    }
    if embed_trc:
        # Wire the embedded TRC to the prepared gateway proxy endpoint.
        _, secret, remote = self._prepare_proxy()
        if not remote:
            return
        remote_ip, remote_port = remote.split(":")
        env.update({
            "REMOTE_IP": remote_ip,
            "REMOTE_PORT": remote_port,
        })
        secret_env["TRC_SECRET"] = secret
    if not self.vdc_instance.kubernetes:
        self.vdc_instance.load_info()
    scheduler = Scheduler(pool_id=pool_id)
    # Try candidate nodes until the container deploys successfully.
    for node in scheduler.nodes_by_capacity(THREEBOT_CPU, THREEBOT_DISK / 1024, THREEBOT_MEMORY / 1024):
        network_view = deployer.get_network_view(self.vdc_name, identity_name=self.identity.instance_name)
        self.vdc_deployer.info(f"VDC threebot: node {node.node_id} selected")
        result = deployer.add_network_node(
            network_view.name, node, pool_id, network_view, self.bot, self.identity.instance_name
        )
        self.vdc_deployer.info(f"VDC threebot network update result for node {node.node_id} is {result}")
        if result:
            network_updated = True
            try:
                for wid in result["ids"]:
                    success = deployer.wait_workload(
                        wid,
                        self.bot,
                        expiry=5,
                        breaking_node_id=node.node_id,
                        identity_name=self.identity.instance_name,
                        cancel_by_uuid=False,
                    )
                    network_updated = network_updated and success
                    if not network_updated:
                        raise DeploymentFailed()
            except DeploymentFailed:
                self.vdc_deployer.error(f"Failed to deploy network on node {node.node_id}")
                continue
        network_view = network_view.copy()
        ip_address = network_view.get_free_ip(node)
        self.vdc_deployer.info(f"VDC threebot container ip address {ip_address}")
        if not ip_address:
            continue
        # Log channel suffix derived from the explorer url.
        explorer = None
        if "test" in j.core.identity.me.explorer_url:
            explorer = "test"
        elif "dev" in j.core.identity.me.explorer_url:
            explorer = "dev"
        else:
            explorer = "main"
        log_config = j.core.config.get("VDC_LOG_CONFIG", {})
        if log_config:
            log_config["channel_name"] = f"{self.vdc_instance.instance_name}_{explorer}"
        wid = deployer.deploy_container(
            pool_id=pool_id,
            node_id=node.node_id,
            network_name=network_view.name,
            ip_address=ip_address,
            flist=flist,
            env=env,
            cpu=THREEBOT_CPU,
            memory=THREEBOT_MEMORY,
            disk_size=THREEBOT_DISK,
            secret_env=secret_env,
            identity_name=self.identity.instance_name,
            description=self.vdc_deployer.description,
            form_info={
                "chatflow": "threebot",
                "Solution name": self.vdc_name,
            },
            solution_uuid=self.vdc_uuid,
            log_config=log_config,
        )
        self.vdc_deployer.info(f"VDC threebot container wid: {wid}")
        try:
            success = deployer.wait_workload(
                wid, self.bot, identity_name=self.identity.instance_name, cancel_by_uuid=False
            )
            if success:
                return wid
            raise DeploymentFailed()
        except DeploymentFailed:
            self.vdc_deployer.error(f"failed to deploy threebot container on node: {node.node_id} wid: {wid}")
            continue
def add_nodes(self):
    """Extend an existing kubernetes cluster with `self.nodes_count` workers.

    Recovers the cluster metadata from the master workload, selects nodes not
    already used by the solution, joins them to the cluster network,
    optionally reserves public ips, deploys the workers and waits for them.
    Failed workers (and their public ip workloads) are decommissioned.

    Raises:
        StopChatFlow: not enough capacity, a node failed to join the network,
            no free ip on a node, or every worker failed to deploy.
    """
    zos = j.sals.zos.get()
    workload = zos.workloads.get(self.master_wid)
    metadata = j.sals.reservation_chatflow.reservation_chatflow.decrypt_reservation_metadata(workload.info.metadata)
    metadata = j.data.serializers.json.loads(metadata)
    pool_id = workload.info.pool_id
    # Nodes already hosting OK workloads of this solution must not be reused.
    old_wids = j.sals.marketplace.solutions.get_workloads_by_uuid(metadata.get("solution_uuid"))
    old_nodes = [wid.info.node_id for wid in old_wids if wid.info.result.state == State.Ok]
    if self.enable_public_ip:
        self.node_query["ipv4u"] = self.nodes_count
    # Over-ask by len(old_nodes) so enough fresh nodes remain after filtering.
    nodes, pools = deployer.ask_multi_pool_distribution(self, self.nodes_count + len(old_nodes), self.node_query)
    nodes_pools_zip = list(zip(nodes, pools))
    selected_nodes = list(filter(lambda x: x[0].node_id not in old_nodes, nodes_pools_zip))
    if len(selected_nodes) < self.nodes_count:
        self.stop(
            f"Failed to find resources to deploy {self.nodes_count}, available nodes are: {len(selected_nodes)}"
        )
    new_nodes = selected_nodes[: self.nodes_count]
    network_view = deployer.get_network_view(workload.network_id)
    master_ip = workload.ipaddress
    self.reservations = []
    for node, pool_id in new_nodes:
        # Attach the node to the cluster network before deploying on it.
        res = deployer.add_network_node(workload.network_id, node, pool_id)
        if res:
            for wid in res["ids"]:
                success = deployer.wait_workload(wid, breaking_node_id=node.node_id)
                if not success:
                    raise StopChatFlow(f"Failed to add node {node.node_id} to network {wid}")
        network_view = network_view.copy()
        ip_address = network_view.get_free_ip(node)
        if not ip_address:
            # BUGFIX: this message previously referenced undefined names
            # `network_name` and `node_id`, raising NameError instead of the
            # intended StopChatFlow ("specifed" typo fixed as well).
            raise StopChatFlow(
                f"No free IPs for network {workload.network_id} on the specified node {node.node_id}"
            )
        self.md_show_update(f"Deploying worker on node {node.node_id}")
        # Reserve a public ip for the worker when requested.
        public_id_wid = 0
        if self.enable_public_ip:
            public_id_wid, _ = deployer.create_public_ip(
                pool_id, node.node_id, solution_uuid=metadata.get("solution_uuid")
            )
        self.reservations.append(
            deployer.deploy_kubernetes_worker(
                pool_id,
                node.node_id,
                workload.network_id,
                workload.cluster_secret,
                workload.ssh_keys,
                ip_address,
                master_ip,
                size=self.cluster_size,
                identity_name=None,
                description="",
                public_ip_wid=public_id_wid,
                **metadata,
            )
        )
    self.success_workload_count = 0
    zos = j.sals.zos.get()
    for resv in self.reservations:
        try:
            # NOTE(review): wait_workload is assumed to raise DeploymentFailed
            # on failure here (its return value was assigned and ignored in the
            # original); the unused assignment was removed.
            deployer.wait_workload(resv, self, cancel_by_uuid=False)
            self.success_workload_count += 1
        except DeploymentFailed as ex:
            # Clean up the failed worker and its public ip workload.
            workload = zos.workloads.get(resv)
            if workload.public_ip:
                zos.workloads.decomission(workload.public_ip)
            # BUGFIX: decommission the failed reservation itself; the original
            # referenced the stale loop variable `wid` from an earlier loop.
            zos.workloads.decomission(resv)
            j.logger.error(f"Failed to deploy workloads for {resv}, the error: {str(ex)}")
    if not self.success_workload_count:
        raise StopChatFlow(msg="Can't extend your cluster, please try again later")
    if self.success_workload_count < len(self.reservations):
        raise StopChatFlow(
            msg=f"Some nodes failed to extend, {self.success_workload_count} of {self.nodes_count}, please try again later"
        )