def create_cluster(self, cluster):
    """Provision all instances of *cluster* and prepare them for use.

    Drives the cluster through the Spawning -> Waiting -> Preparing
    status sequence: spawn instances, wait for them to become active and
    networked, attach volumes, then run instance configuration.

    :param cluster: cluster object to provision
    """
    ctx = context.ctx()
    # Arm the rollback strategy up-front: a failure anywhere below
    # should tear the whole cluster down (shutdown=True).
    self._update_rollback_strategy(cluster, shutdown=True)

    # create all instances
    cluster = g.change_cluster_status(cluster, "Spawning")
    self._create_instances(cluster)

    # wait for all instances are up and networks ready
    cluster = g.change_cluster_status(cluster, "Waiting")
    instances = g.get_instances(cluster)
    self._await_active(cluster, instances)
    self._assign_floating_ips(instances)
    self._await_networks(cluster, instances)
    # Re-read the cluster: instance/network data was updated in the DB
    # by the waiting helpers above.
    cluster = conductor.cluster_get(ctx, cluster)

    # attach volumes
    volumes.attach_to_instances(g.get_instances(cluster))

    # prepare all instances
    cluster = g.change_cluster_status(cluster, "Preparing")
    self._configure_instances(cluster)

    # Provisioning succeeded: clear the rollback strategy.
    self._update_rollback_strategy(cluster)
def _launch_instances(self, cluster, target_count, stages,
                      update_stack=False, disable_rollback=True):
    """Spawn instances and bring them to a configured state.

    Walks the cluster through the three supplied ``stages`` statuses
    (create, wait-for-network, prepare) and returns the ids of the
    instances created during this call.
    """
    # Stage 1: create the instances.
    cluster = g.change_cluster_status(cluster, stages[0])
    new_ids = self._create_instances(
        cluster, target_count, update_stack, disable_rollback)

    # Stage 2: block until every new instance is reachable.
    cluster = g.change_cluster_status(cluster, stages[1])
    self._await_networks(cluster, g.get_instances(cluster, new_ids))

    # Stage 3: mount volumes and apply instance configuration.
    cluster = g.change_cluster_status(cluster, stages[2])
    volumes.mount_to_instances(g.get_instances(cluster, new_ids))
    self._configure_instances(cluster)

    return new_ids
def test_get_instances(self):
    """get_instances() must return every instance, both when given an
    explicit id list and when called without one."""
    cluster = self._make_sample()
    ctx = context.ctx()
    idx = 0
    ids = []
    for ng in cluster.node_groups:
        for i in range(ng.count):
            idx += 1
            ids.append(self.api.instance_add(
                context.ctx(), ng,
                {'instance_id': str(idx), 'instance_name': str(idx)}))
    cluster = self.api.cluster_get(ctx, cluster)

    def _assert_all_instances(instances):
        # Collect the returned instance ids and verify the full set.
        found = set()
        for inst in instances:
            found.add(inst.instance_id)
        self.assertEqual(idx, len(found))
        # BUG FIX: the upper bound was `idx`, so the last instance id
        # (str(idx)) was never checked; range(1, idx + 1) covers all.
        for i in range(1, idx + 1):
            self.assertIn(str(i), found)

    # Lookup with an explicit id list.
    _assert_all_instances(general.get_instances(cluster, ids))
    # Lookup without ids: every instance of the cluster.
    _assert_all_instances(general.get_instances(cluster))
def launch_instances(self, ctx, cluster, target_count):
    """Create instances via a Heat stack and prepare them.

    Walks the cluster through self.STAGES: instantiate/update the Heat
    stack, wait for networking, then mount volumes and configure.
    Bails out silently if the cluster is deleted while waiting.
    """
    # create all instances
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[0]})
    LOG.info(g.format_cluster_status(cluster))
    tmpl = heat.ClusterTemplate(cluster)
    self._configure_template(ctx, tmpl, cluster, target_count)
    # UPDATE_STACK distinguishes scaling (update) from creation.
    stack = tmpl.instantiate(update_existing=self.UPDATE_STACK)
    stack.wait_till_active()
    # Record the newly created instances in the DB; remember their ids
    # on self so later stages only touch the new instances.
    self.inst_ids = self._populate_cluster(ctx, cluster, stack)

    # wait for all instances are up and networks ready
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[1]})
    LOG.info(g.format_cluster_status(cluster))
    instances = g.get_instances(cluster, self.inst_ids)
    self._await_networks(cluster, instances)
    if not g.check_cluster_exists(cluster):
        # Cluster was deleted concurrently; nothing left to do.
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    # prepare all instances
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[2]})
    LOG.info(g.format_cluster_status(cluster))
    instances = g.get_instances(cluster, self.inst_ids)
    volumes.mount_to_instances(instances)
    self._configure_instances(cluster)
def scale_cluster(self, cluster, node_group_id_map):
    """Add/remove instances so node groups match *node_group_id_map*.

    :param node_group_id_map: mapping of node group id -> desired count
    :returns: list of ids of the newly created instances
    """
    ctx = context.ctx()
    cluster = g.change_cluster_status(cluster, "Scaling")

    instance_ids = self._scale_cluster_instances(cluster,
                                                 node_group_id_map)
    # Arm rollback: on failure only the newly added instances
    # (instance_ids) are removed, not the whole cluster.
    self._update_rollback_strategy(cluster, instance_ids=instance_ids)

    cluster = conductor.cluster_get(ctx, cluster)
    g.clean_cluster_from_empty_ng(cluster)
    # Re-read after the cleanup mutated the cluster in the DB.
    cluster = conductor.cluster_get(ctx, cluster)

    instances = g.get_instances(cluster, instance_ids)
    self._await_active(cluster, instances)
    self._assign_floating_ips(instances)
    self._await_networks(cluster, instances)

    cluster = conductor.cluster_get(ctx, cluster)
    volumes.attach_to_instances(g.get_instances(cluster, instance_ids))

    # we should be here with valid cluster: if instances creation
    # was not successful all extra-instances will be removed above
    if instance_ids:
        self._configure_instances(cluster)

    # Scaling succeeded: clear the rollback strategy.
    self._update_rollback_strategy(cluster)
    return instance_ids
def launch_instances(self, cluster, target_count):
    """Create instances via Heat and prepare them for use.

    Walks the cluster through self.STAGES: instantiate/update the Heat
    stack, wait for networking, mount volumes, then configure.
    """
    # create all instances
    cluster = g.change_cluster_status(cluster, self.STAGES[0])
    tmpl = heat.ClusterTemplate(cluster)
    self._configure_template(tmpl, cluster, target_count)
    # UPDATE_STACK / DISABLE_ROLLBACK distinguish scaling from creation.
    stack = tmpl.instantiate(update_existing=self.UPDATE_STACK,
                             disable_rollback=self.DISABLE_ROLLBACK)
    heat.wait_stack_completion(stack.heat_stack)
    # Record new instances in the DB; keep their ids for later stages.
    self.inst_ids = self._populate_cluster(cluster, stack)

    # wait for all instances are up and networks ready
    cluster = g.change_cluster_status(cluster, self.STAGES[1])
    instances = g.get_instances(cluster, self.inst_ids)
    self._await_networks(cluster, instances)

    # prepare all instances
    cluster = g.change_cluster_status(cluster, self.STAGES[2])
    instances = g.get_instances(cluster, self.inst_ids)
    volumes.mount_to_instances(instances)
    self._configure_instances(cluster)
def scale_cluster(self, cluster, node_group_id_map):
    """Add/remove instances so node groups match *node_group_id_map*.

    :param node_group_id_map: mapping of node group id -> desired count
    :returns: list of ids of the newly created instances
    """
    ctx = context.ctx()
    cluster = g.change_cluster_status(cluster, "Scaling")

    instance_ids = self._scale_cluster_instances(cluster,
                                                 node_group_id_map)
    # Arm rollback: on failure only the newly added instances
    # (instance_ids) are removed, not the whole cluster.
    self._update_rollback_strategy(cluster, instance_ids=instance_ids)

    cluster = conductor.cluster_get(ctx, cluster)
    g.clean_cluster_from_empty_ng(cluster)
    # Re-read after the cleanup mutated the cluster in the DB.
    cluster = conductor.cluster_get(ctx, cluster)

    instances = g.get_instances(cluster, instance_ids)
    self._await_active(cluster, instances)
    self._assign_floating_ips(instances)
    self._await_networks(cluster, instances)

    cluster = conductor.cluster_get(ctx, cluster)
    volumes.attach_to_instances(
        g.get_instances(cluster, instance_ids))

    # we should be here with valid cluster: if instances creation
    # was not successful all extra-instances will be removed above
    if instance_ids:
        self._configure_instances(cluster)

    # Scaling succeeded: clear the rollback strategy.
    self._update_rollback_strategy(cluster)
    return instance_ids
def launch_instances(self, ctx, cluster, target_count):
    """Create instances via a Heat stack and prepare them.

    Walks the cluster through self.STAGES: instantiate/update the Heat
    stack, wait for networking, then mount volumes and configure.
    """
    # create all instances
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[0]})
    LOG.info(g.format_cluster_status(cluster))
    tmpl = heat.ClusterTemplate(cluster)
    self._configure_template(ctx, tmpl, cluster, target_count)
    # UPDATE_STACK distinguishes scaling (update) from creation.
    stack = tmpl.instantiate(update_existing=self.UPDATE_STACK)
    stack.wait_till_active()
    # Record new instances in the DB; keep their ids for later stages.
    self.inst_ids = self._populate_cluster(ctx, cluster, stack)

    # wait for all instances are up and networks ready
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[1]})
    LOG.info(g.format_cluster_status(cluster))
    instances = g.get_instances(cluster, self.inst_ids)
    self._await_networks(cluster, instances)

    # prepare all instances
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": self.STAGES[2]})
    LOG.info(g.format_cluster_status(cluster))
    instances = g.get_instances(cluster, self.inst_ids)
    volumes.mount_to_instances(instances)
    self._configure_instances(cluster)
def scale_cluster(self, cluster, node_group_id_map):
    """Add/remove instances so node groups match *node_group_id_map*.

    On any failure the added instances are rolled back and the cluster
    is restored to "Active"; the exception is then re-raised via
    excutils.save_and_reraise_exception().

    :returns: ids of newly created instances, or [] if nothing was
        added (cluster deleted mid-operation, or rollback ran)
    """
    ctx = context.ctx()
    instance_ids = []
    try:
        instance_ids = self._scale_cluster_instances(cluster,
                                                     node_group_id_map)

        cluster = conductor.cluster_get(ctx, cluster)
        g.clean_cluster_from_empty_ng(cluster)
        # Re-read after the cleanup mutated the cluster in the DB.
        cluster = conductor.cluster_get(ctx, cluster)
        instances = g.get_instances(cluster, instance_ids)

        self._await_active(cluster, instances)

        if not g.check_cluster_exists(cluster):
            # Cluster was deleted concurrently; abort quietly.
            LOG.info(g.format_cluster_deleted_message(cluster))
            return []

        self._assign_floating_ips(instances)

        self._await_networks(cluster, instances)

        if not g.check_cluster_exists(cluster):
            LOG.info(g.format_cluster_deleted_message(cluster))
            return []

        cluster = conductor.cluster_get(ctx, cluster)
        volumes.attach_to_instances(
            g.get_instances(cluster, instance_ids))

    except Exception as ex:
        # NOTE(review): save_and_reraise_exception re-raises the saved
        # exception when this block exits — including after `return` —
        # so the caller still sees the failure after rollback.
        with excutils.save_and_reraise_exception():
            if not g.check_cluster_exists(cluster):
                LOG.info(g.format_cluster_deleted_message(cluster))
                return []
            self._log_operation_exception(
                "Can't scale cluster '%s' (reason: %s)", cluster, ex)

            # Remove only the instances added by this scaling attempt.
            cluster = conductor.cluster_get(ctx, cluster)
            self._rollback_cluster_scaling(
                cluster, g.get_instances(cluster, instance_ids), ex)
            instance_ids = []

            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)
            # Restore the pre-scaling healthy state.
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Active"})

            LOG.info(g.format_cluster_status(cluster))

    # we should be here with valid cluster: if instances creation
    # was not successful all extra-instances will be removed above
    if instance_ids:
        self._configure_instances(cluster)
    return instance_ids
def create_cluster(self, cluster):
    """Provision all instances of *cluster* and prepare them for use.

    Drives the cluster through Spawning -> Waiting -> Preparing.  On
    failure the cluster is marked "Error" and creation is rolled back;
    the exception is re-raised via save_and_reraise_exception().
    Aborts quietly if the cluster is deleted mid-operation.
    """
    ctx = context.ctx()
    try:
        # create all instances
        conductor.cluster_update(ctx, cluster, {"status": "Spawning"})
        LOG.info(g.format_cluster_status(cluster))
        self._create_instances(cluster)

        # wait for all instances are up and networks ready
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Waiting"})
        LOG.info(g.format_cluster_status(cluster))

        instances = g.get_instances(cluster)

        self._await_active(cluster, instances)

        if not g.check_cluster_exists(cluster):
            # Cluster was deleted concurrently; nothing left to do.
            LOG.info(g.format_cluster_deleted_message(cluster))
            return

        self._assign_floating_ips(instances)

        self._await_networks(cluster, instances)

        if not g.check_cluster_exists(cluster):
            LOG.info(g.format_cluster_deleted_message(cluster))
            return

        # Re-read: instance/network data was updated in the DB above.
        cluster = conductor.cluster_get(ctx, cluster)

        # attach volumes
        volumes.attach_to_instances(g.get_instances(cluster))

        # prepare all instances
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Preparing"})
        LOG.info(g.format_cluster_status(cluster))

        self._configure_instances(cluster)
    except Exception as ex:
        with excutils.save_and_reraise_exception():
            if not g.check_cluster_exists(cluster):
                LOG.info(g.format_cluster_deleted_message(cluster))
                return
            self._log_operation_exception(
                "Can't start cluster '%s' (reason: %s)", cluster, ex)

            cluster = conductor.cluster_update(
                ctx, cluster, {"status": "Error",
                               "status_description": str(ex)})
            LOG.info(g.format_cluster_status(cluster))
            self._rollback_cluster_creation(cluster, ex)
def scale_cluster(self, cluster, node_group_id_map):
    """Add/remove instances so node groups match *node_group_id_map*.

    On failure the added instances are rolled back; the cluster ends
    up "Error" if it was mid-decommission, otherwise "Active".  The
    exception is re-raised via save_and_reraise_exception().

    :returns: ids of newly created instances ([] after rollback)
    """
    ctx = context.ctx()
    instance_ids = []
    try:
        instance_ids = self._scale_cluster_instances(cluster,
                                                     node_group_id_map)

        cluster = conductor.cluster_get(ctx, cluster)
        g.clean_cluster_from_empty_ng(cluster)
        # Re-read after the cleanup mutated the cluster in the DB.
        cluster = conductor.cluster_get(ctx, cluster)
        instances = g.get_instances(cluster, instance_ids)

        self._await_active(cluster, instances)

        self._assign_floating_ips(instances)

        self._await_networks(cluster, instances)

        cluster = conductor.cluster_get(ctx, cluster)
        volumes.attach_to_instances(
            g.get_instances(cluster, instance_ids))

    except Exception as ex:
        # NOTE(review): save_and_reraise_exception re-raises when this
        # block exits, so the caller still observes the failure.
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't scale cluster '%s' (reason: %s)", cluster, ex)

            # Remove only the instances added by this scaling attempt.
            cluster = conductor.cluster_get(ctx, cluster)
            self._rollback_cluster_scaling(
                cluster, g.get_instances(cluster, instance_ids), ex)
            instance_ids = []

            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)
            if cluster.status == 'Decommissioning':
                # Decommissioning failed; cluster state is suspect.
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Error"})
            else:
                # Infrastructure rollback restored a healthy state.
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Active"})

            LOG.info(g.format_cluster_status(cluster))

    # we should be here with valid cluster: if instances creation
    # was not successful all extra-instances will be removed above
    if instance_ids:
        self._configure_instances(cluster)
    return instance_ids
def _await_networks(self, cluster, instances):
    """Wait until every instance has an IP and is reachable over SSH.

    Reports progress through two provisioning steps ("Assign IPs" and
    "Wait for instance accessibility").  No-op for an empty list.
    """
    if not instances:
        return

    cpo.add_provisioning_step(cluster.id, _("Assign IPs"),
                              len(instances))

    ips_assigned = set()
    # Fills ips_assigned with ids of instances that got an IP.
    self._ips_assign(ips_assigned, cluster, instances)

    LOG.info(_LI("All instances have IPs assigned"))

    # Re-read so the freshly assigned IPs are visible on the objects.
    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, ips_assigned)

    cpo.add_provisioning_step(cluster.id,
                              _("Wait for instance accessibility"),
                              len(instances))

    # Probe all instances concurrently; one thread per instance.
    with context.ThreadGroup() as tg:
        for instance in instances:
            with context.set_current_instance_id(instance.instance_id):
                tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                         self._wait_until_accessible, instance)

    LOG.info(_LI("All instances are accessible"))
def _provision_scaled_cluster(cluster_id, node_group_id_map):
    """Resize a running cluster to the counts in *node_group_id_map*."""
    ctx, cluster, plugin = _prepare_provisioning(cluster_id)

    # Let the plugin gracefully retire nodes above the new counts.
    cluster = g.change_cluster_status(cluster, "Decommissioning")

    surplus = []
    for ng in cluster.node_groups:
        wanted = node_group_id_map[ng.id]
        if wanted < ng.count:
            surplus.extend(ng.instances[wanted:ng.count])

    if surplus:
        plugin.decommission_nodes(cluster, surplus)

    # Grow/shrink the underlying infrastructure.
    cluster = g.change_cluster_status(cluster, "Scaling")
    new_ids = INFRA.scale_cluster(cluster, node_group_id_map)

    # Hand any newly created nodes to the plugin for configuration.
    if new_ids:
        cluster = g.change_cluster_status(cluster, "Configuring")
        plugin.scale_cluster(cluster, g.get_instances(cluster, new_ids))

    g.change_cluster_status(cluster, "Active")
def test_attach(self, add_step, add_event, p_create_attach_vol,
                p_await, p_mount):
    """attach_to_instances() must create, await and mount
    volumes_per_node volumes for every instance of the node group."""
    # Two volumes per call, cycled for each of the two instances.
    p_create_attach_vol.side_effect = ['/dev/vdb', '/dev/vdc'] * 2
    p_await.return_value = None
    p_mount.return_value = None
    add_event.return_value = None
    add_step.return_value = None

    instance1 = {'id': '1',
                 'instance_id': '123',
                 'instance_name': 'inst_1'}
    instance2 = {'id': '2',
                 'instance_id': '456',
                 'instance_name': 'inst_2'}

    ng = {'volumes_per_node': 2,
          'volumes_size': 2,
          'volumes_availability_zone': None,
          'volume_mount_prefix': '/mnt/vols',
          'volume_type': None,
          'name': 'master',
          'cluster_id': '11',
          'instances': [instance1, instance2],
          'volume_local_to_instance': False}

    cluster = r.ClusterResource({'node_groups': [ng]})

    volumes.attach_to_instances(g.get_instances(cluster))

    # 2 instances * 2 volumes = 4 create/mount calls, 1 await each.
    self.assertEqual(4, p_create_attach_vol.call_count)
    self.assertEqual(2, p_await.call_count)
    self.assertEqual(4, p_mount.call_count)
def _provision_scaled_cluster(cluster_id, node_group_id_map): ctx, cluster, plugin = _prepare_provisioning(cluster_id) # Decommissioning surplus nodes with the plugin cluster = g.change_cluster_status(cluster, "Decommissioning") instances_to_delete = [] for node_group in cluster.node_groups: new_count = node_group_id_map[node_group.id] if new_count < node_group.count: instances_to_delete += node_group.instances[new_count:node_group. count] if instances_to_delete: plugin.decommission_nodes(cluster, instances_to_delete) # Scaling infrastructure cluster = g.change_cluster_status(cluster, "Scaling") instance_ids = INFRA.scale_cluster(cluster, node_group_id_map) # Setting up new nodes with the plugin if instance_ids: cluster = g.change_cluster_status(cluster, "Configuring") instances = g.get_instances(cluster, instance_ids) plugin.scale_cluster(cluster, instances) g.change_cluster_status(cluster, "Active")
def test_attach(self, p_create_attach_vol, p_await, p_mount):
    """attach_to_instances() must create, await and mount
    volumes_per_node volumes for every instance of the node group."""
    # Two volumes per call, cycled for each of the two instances.
    p_create_attach_vol.side_effect = ['/dev/vdb', '/dev/vdc'] * 2
    p_await.return_value = None
    p_mount.return_value = None

    instance1 = {'id': '1',
                 'instance_id': '123',
                 'instance_name': 'inst_1'}
    instance2 = {'id': '2',
                 'instance_id': '456',
                 'instance_name': 'inst_2'}
    ng = {'volumes_per_node': 2,
          'volumes_size': 2,
          'volume_mount_prefix': '/mnt/vols',
          'name': 'master',
          'instances': [instance1, instance2]}
    cluster = r.ClusterResource({'node_groups': [ng]})

    volumes.attach_to_instances(g.get_instances(cluster))

    # FIX: assertEqual takes (expected, actual) — the arguments were
    # reversed, which produces misleading failure messages.
    # 2 instances * 2 volumes = 4 create/mount calls, 1 await each.
    self.assertEqual(4, p_create_attach_vol.call_count)
    self.assertEqual(2, p_await.call_count)
    self.assertEqual(4, p_mount.call_count)
def _await_networks(self, cluster, instances):
    """Poll until every instance has an IP, then wait for SSH access.

    Returns early (silently) if the cluster is deleted while polling.
    No-op for an empty instance list.
    """
    if not instances:
        return

    ips_assigned = set()
    while len(ips_assigned) != len(instances):
        if not g.check_cluster_exists(cluster):
            # Cluster deleted under us; stop waiting.
            return
        for instance in instances:
            if instance.id not in ips_assigned:
                # init_instances_ips returns truthy once IPs are set.
                if networks.init_instances_ips(instance):
                    ips_assigned.add(instance.id)

        # Poll once per second.
        context.sleep(1)

    LOG.info(
        _LI("Cluster '%s': all instances have IPs assigned"), cluster.id)

    # Re-read so the freshly assigned IPs are visible on the objects.
    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, ips_assigned)

    # Probe all instances concurrently; one thread per instance.
    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     self._wait_until_accessible, instance)

    LOG.info(_LI("Cluster '%s': all instances are accessible"),
             cluster.id)
def _await_networks(self, cluster, instances):
    """Wait until every instance has an IP and is reachable over SSH.

    Reports progress through two provisioning steps ("Assign IPs" and
    "Wait for instance accessibility").  No-op for an empty list.
    """
    if not instances:
        return

    cpo.add_provisioning_step(cluster.id, _("Assign IPs"),
                              len(instances))

    ips_assigned = set()
    # Fills ips_assigned with ids of instances that got an IP.
    self._ips_assign(ips_assigned, cluster, instances)

    LOG.info(
        _LI("Cluster {cluster_id}: all instances have IPs assigned")
        .format(cluster_id=cluster.id))

    # Re-read so the freshly assigned IPs are visible on the objects.
    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, ips_assigned)

    cpo.add_provisioning_step(
        cluster.id, _("Wait for instance accessibility"),
        len(instances))

    # Probe all instances concurrently; one thread per instance.
    with context.ThreadGroup() as tg:
        for instance in instances:
            tg.spawn("wait-for-ssh-%s" % instance.instance_name,
                     self._wait_until_accessible, instance)

    LOG.info(_LI("Cluster {cluster_id}: all instances are accessible")
             .format(cluster_id=cluster.id))
def _add_volumes(self, ctx, cluster):
    """Record Cinder volumes attached by Heat in the Sahara DB.

    For each instance, walks the Heat resources that depend on it and
    appends the volume id of every VolumeAttachment found.
    """
    for instance in g.get_instances(cluster):
        # Resources that require this server include its attachments.
        res_names = heat.client().resources.get(
            cluster.name, instance.instance_name).required_by
        for res_name in res_names:
            vol_res = heat.client().resources.get(cluster.name,
                                                  res_name)
            if vol_res.resource_type == (('OS::Cinder::'
                                          'VolumeAttachment')):
                volume_id = vol_res.physical_resource_id
                conductor.append_volume(ctx, instance, volume_id)
def launch_instances(self, cluster, target_count):
    """Create instances, wait for networking and prepare them.

    Walks the cluster through the three self.STAGES statuses.
    """
    # Stage 1: spawn the instances.
    cluster = g.change_cluster_status(cluster, self.STAGES[0])
    self.create_instances(cluster, target_count)

    # Stage 2: block until the new instances are reachable.
    cluster = g.change_cluster_status(cluster, self.STAGES[1])
    self._await_networks(cluster,
                         g.get_instances(cluster, self.inst_ids))

    # Stage 3: mount volumes and apply instance configuration.
    cluster = g.change_cluster_status(cluster, self.STAGES[2])
    fresh_instances = g.get_instances(cluster, self.inst_ids)
    volumes.mount_to_instances(fresh_instances)
    self._configure_instances(cluster)
def configure_ntp(cluster_id):
    """Configure NTP on every instance of the cluster, if enabled."""
    cluster = conductor.cluster_get(context.ctx(), cluster_id)
    if not is_ntp_enabled(cluster):
        LOG.debug("Don't configure NTP on cluster")
        return
    url = retrieve_ntp_server_url(cluster)
    # Configure all instances concurrently, one thread each.
    with context.ThreadGroup() as tg:
        for inst in g.get_instances(cluster):
            tg.spawn("configure-ntp-%s" % inst.instance_name,
                     _configure_ntp_on_instance, inst, url)
def _provision_scaled_cluster(cluster_id, node_group_id_map): ctx, cluster, plugin = _prepare_provisioning(cluster_id) # Decommissioning surplus nodes with the plugin cluster = conductor.cluster_update(ctx, cluster, {"status": "Decommissioning"}) LOG.info(g.format_cluster_status(cluster)) instances_to_delete = [] for node_group in cluster.node_groups: new_count = node_group_id_map[node_group.id] if new_count < node_group.count: instances_to_delete += node_group.instances[new_count: node_group.count] if instances_to_delete: plugin.decommission_nodes(cluster, instances_to_delete) # Scaling infrastructure cluster = conductor.cluster_update(ctx, cluster, {"status": "Scaling"}) LOG.info(g.format_cluster_status(cluster)) instances = INFRA.scale_cluster(cluster, node_group_id_map) # Setting up new nodes with the plugin if instances: cluster = conductor.cluster_update(ctx, cluster, {"status": "Configuring"}) LOG.info(g.format_cluster_status(cluster)) try: instances = g.get_instances(cluster, instances) plugin.scale_cluster(cluster, instances) except Exception as ex: if not g.check_cluster_exists(cluster): LOG.info(g.format_cluster_deleted_message(cluster)) return LOG.exception("Can't scale cluster '%s' (reason: %s)", cluster.name, ex) cluster = conductor.cluster_update(ctx, cluster, {"status": "Error"}) LOG.info(g.format_cluster_status(cluster)) return if not g.check_cluster_exists(cluster): LOG.info(g.format_cluster_deleted_message(cluster)) return cluster = conductor.cluster_update(ctx, cluster, {"status": "Active"}) LOG.info(g.format_cluster_status(cluster))
def shutdown_cluster(self, cluster):
    """Shutdown specified cluster and all related resources."""
    try:
        heat.client().stacks.delete(cluster.name)
    except heat_exc.HTTPNotFound:
        # Stack may already be gone; log and continue cleanup.
        # FIX: LOG.warn is a deprecated alias of LOG.warning; also
        # corrected the grammar ("Did not found") and switched to lazy
        # %-style logger arguments instead of eager interpolation.
        LOG.warning("Did not find stack for cluster %s", cluster.name)
    self._clean_job_executions(cluster)

    # Remove DB records of the cluster's instances.
    ctx = context.ctx()
    instances = g.get_instances(cluster)
    for inst in instances:
        conductor.instance_remove(ctx, inst)
def shutdown_cluster(self, cluster):
    """Shutdown specified cluster and all related resources."""
    try:
        heat.client().stacks.delete(cluster.name)
    except heat_exc.HTTPNotFound:
        # Stack may already be gone; log and continue cleanup.
        # FIX: LOG.warn is a deprecated alias of LOG.warning; also
        # corrected the grammar ("Did not found") and switched to lazy
        # %-style logger arguments instead of eager interpolation.
        LOG.warning('Did not find stack for cluster %s', cluster.name)
    self._clean_job_executions(cluster)

    # Remove DB records of the cluster's instances.
    ctx = context.ctx()
    instances = g.get_instances(cluster)
    for inst in instances:
        conductor.instance_remove(ctx, inst)
def launch_instances(self, cluster, target_count):
    """Create instances via Heat, wait for networking and prepare them.

    Walks the cluster through the three self.STAGES statuses and
    reports stack creation as a provisioning step.
    """
    # create all instances
    cluster = g.change_cluster_status(cluster, self.STAGES[0])

    # Stack creation is reported as a single provisioning event.
    cpo.add_provisioning_step(cluster.id, _("Create Heat stack"), 1)
    with context.InstanceInfoManager([cluster.id, None, None, None]):
        self.create_instances(cluster, target_count)

    # wait for all instances are up and networks ready
    cluster = g.change_cluster_status(cluster, self.STAGES[1])

    instances = g.get_instances(cluster, self.inst_ids)
    self._await_networks(cluster, instances)

    # prepare all instances
    cluster = g.change_cluster_status(cluster, self.STAGES[2])

    instances = g.get_instances(cluster, self.inst_ids)
    volumes.mount_to_instances(instances)

    self._configure_instances(cluster)
def _get_instance_if_running(self, job_execution):
    """Return (pid, instance) for a live job, or (None, None).

    A job with missing pid/instance id, or one already in a terminated
    state, yields (None, None).  The instance is None if the DB lookup
    fails for any reason.
    """
    pid, inst_id = self._get_pid_and_inst_id(job_execution.oozie_job_id)
    terminated = (
        job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED)
    if not pid or not inst_id or terminated:
        return None, None
    # TODO(tmckay): well, if there is a list index out of range
    # error here it probably means that the instance is gone. If we
    # have a job execution that is not terminated, and the instance
    # is gone, we should probably change the status somehow.
    # For now, do nothing.
    try:
        instance = general.get_instances(self.cluster, [inst_id])[0]
    except Exception:
        instance = None
    return pid, instance
def _get_instance_if_running(self, job_execution):
    """Return (pid, instance) for a live job, or (None, None).

    A job with missing pid/instance id, or one already in a terminated
    state, yields (None, None).  The instance is None if the DB lookup
    fails for any reason.
    """
    pid, inst_id = self._get_pid_and_inst_id(job_execution.oozie_job_id)
    if not pid or not inst_id or (job_execution.info['status'] in
                                  edp.JOB_STATUSES_TERMINATED):
        return None, None
    # TODO(tmckay): well, if there is a list index out of range
    # error here it probably means that the instance is gone. If we
    # have a job execution that is not terminated, and the instance
    # is gone, we should probably change the status somehow.
    # For now, do nothing.
    try:
        instance = general.get_instances(self.cluster, [inst_id])[0]
    except Exception:
        instance = None
    return pid, instance
def _populate_cluster(self, cluster, stack):
    """Register stack instances unknown to the DB; return their ids."""
    ctx = context.ctx()
    # Ids of instances the DB already knows about.
    known = {i.instance_id for i in g.get_instances(cluster)}
    added = []
    for node_group in cluster.node_groups:
        for name, nova_id in stack.get_node_group_instances(node_group):
            if nova_id in known:
                continue
            added.append(conductor.instance_add(
                ctx, node_group,
                {"instance_id": nova_id, "instance_name": name}))
    return added
def _launch_instances(self, cluster, target_count, stages,
                      update_stack=False, disable_rollback=True):
    """Spawn instances and bring them to a configured state.

    Walks the cluster through the three supplied ``stages`` statuses
    (create, wait-for-network, prepare).

    :returns: ids of the instances created during this call
    """
    # create all instances
    cluster = g.change_cluster_status(cluster, stages[0])

    inst_ids = self._create_instances(
        cluster, target_count, update_stack, disable_rollback)

    # wait for all instances are up and networks ready
    cluster = g.change_cluster_status(cluster, stages[1])

    instances = g.get_instances(cluster, inst_ids)
    self._await_networks(cluster, instances)

    # prepare all instances
    cluster = g.change_cluster_status(cluster, stages[2])

    instances = g.get_instances(cluster, inst_ids)
    volumes.mount_to_instances(instances)

    self._configure_instances(cluster)

    return inst_ids
def _populate_cluster(self, ctx, cluster, stack):
    """Register stack instances unknown to the DB.

    :returns: ids of the instance records added to the DB
    """
    # Ids of instances the DB already knows about.
    old_ids = [i.instance_id for i in g.get_instances(cluster)]

    new_ids = []

    for node_group in cluster.node_groups:
        nova_ids = stack.get_node_group_instances(node_group)
        for name, nova_id in nova_ids:
            if nova_id not in old_ids:
                instance_id = conductor.instance_add(
                    ctx, node_group, {"instance_id": nova_id,
                                      "instance_name": name}
                )
                new_ids.append(instance_id)

    return new_ids
def decommission_nodes(cluster, instances, configure_sh_string):
    """Remove *instances* from the MapR cluster.

    Moves and stops the nodes, waits for the no-heartbeat alarm
    interval, removes the nodes and their services, and — if any
    removed node hosted CLDB or ZooKeeper — re-runs configure.sh on
    the remaining nodes so they learn the new topology.
    """
    # FIX: removed the stray space before the period in the message
    # ('Start decommission . Cluster').
    LOG.info(_LI('Start decommission. Cluster = %s'), cluster.name)
    move_node(cluster, instances)
    stop_services(cluster, instances)
    # Wait long enough for the cluster to flag the nodes as dead.
    context.sleep(names.WAIT_NODE_ALARM_NO_HEARTBEAT)
    remove_node(cluster, instances)
    remove_services(cluster, instances)
    if check_for_cldb_or_zookeeper_service(instances):
        # Topology changed for CLDB/ZooKeeper: reconfigure survivors.
        all_instances = gen.get_instances(cluster)
        current_cluster_instances = [
            x for x in all_instances if x not in instances]
        for inst in current_cluster_instances:
            start_helper.exec_configure_sh_on_instance(
                cluster, inst, configure_sh_string)
    LOG.info(_LI('End decommission. Cluster = %s'), cluster.name)
def shutdown_cluster(self, cluster):
    """Shutdown specified cluster and all related resources."""
    try:
        heat.client().stacks.delete(cluster.name)
        # Block until the stack deletion actually completes.
        stack = heat.get_stack(cluster.name)
        heat.wait_stack_completion(stack)
    except heat_exc.HTTPNotFound:
        # Stack may already be gone; log and continue cleanup.
        # NOTE(review): message grammar ("Did not found") predates this
        # change; fixing it would alter the translation msgid.
        LOG.warning(_LW('Did not found stack for cluster {cluster_name}')
                    .format(cluster_name=cluster.name))
    self._clean_job_executions(cluster)

    # Remove DB records of the cluster's instances.
    ctx = context.ctx()
    instances = g.get_instances(cluster)
    for inst in instances:
        conductor.instance_remove(ctx, inst)
def rollback_cluster(self, cluster, reason):
    """Undo a failed create or scale operation.

    :param reason: exception that triggered the rollback
    :returns: True when a scaling rollback ran, False otherwise
    """
    info = cluster.rollback_info or {}
    self._update_rollback_strategy(cluster)

    if info.get('shutdown', False):
        # Creation failed: tear the whole cluster down.
        self._rollback_cluster_creation(cluster, reason)
        return False

    added_ids = info.get('instance_ids', [])
    if added_ids:
        # Scaling failed: remove only the instances added by the scale.
        self._rollback_cluster_scaling(
            cluster, g.get_instances(cluster, added_ids), reason)
        return True

    return False
def _provision_scaled_cluster(id, node_group_id_map):
    """Resize a running cluster to the counts in *node_group_id_map*.

    Decommissions surplus nodes, scales the infrastructure, configures
    new nodes, and ends in "Active" (or "Error" if the plugin fails).
    """
    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)

    # Decommissioning surplus nodes with the plugin
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": "Decommissioning"})
    LOG.info(g.format_cluster_status(cluster))

    instances_to_delete = []

    for node_group in cluster.node_groups:
        new_count = node_group_id_map[node_group.id]
        if new_count < node_group.count:
            # Everything past the new count is surplus.
            instances_to_delete += node_group.instances[
                new_count:node_group.count]

    if instances_to_delete:
        plugin.decommission_nodes(cluster, instances_to_delete)

    # Scaling infrastructure
    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": "Scaling"})
    LOG.info(g.format_cluster_status(cluster))

    instances = INFRA.scale_cluster(cluster, node_group_id_map)

    # Setting up new nodes with the plugin
    if instances:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Configuring"})
        LOG.info(g.format_cluster_status(cluster))
        try:
            instances = g.get_instances(cluster, instances)
            plugin.scale_cluster(cluster, instances)
        except Exception as ex:
            LOG.exception("Can't scale cluster '%s' (reason: %s)",
                          cluster.name, ex)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Error"})
            LOG.info(g.format_cluster_status(cluster))
            return

    cluster = conductor.cluster_update(ctx, cluster,
                                       {"status": "Active"})
    LOG.info(g.format_cluster_status(cluster))
def _populate_cluster(self, cluster, stack):
    """Register stack instances unknown to the DB.

    :returns: ids of the instance records added to the DB
    """
    ctx = context.ctx()
    # Ids of instances the DB already knows about.
    old_ids = [i.instance_id for i in g.get_instances(cluster)]

    new_ids = []

    for node_group in cluster.node_groups:
        instances = stack.get_node_group_instances(node_group)
        for instance in instances:
            nova_id = instance['physical_id']
            name = instance['name']
            if nova_id not in old_ids:
                instance_id = conductor.instance_add(
                    ctx, node_group, {"instance_id": nova_id,
                                      "instance_name": name})
                new_ids.append(instance_id)

    return new_ids
def _provision_scaled_cluster(cluster_id, node_group_id_map): ctx, cluster, plugin = _prepare_provisioning(cluster_id) # Decommissioning surplus nodes with the plugin cluster = g.change_cluster_status(cluster, "Decommissioning") instances_to_delete = [] for node_group in cluster.node_groups: new_count = node_group_id_map[node_group.id] if new_count < node_group.count: instances_to_delete += node_group.instances[new_count: node_group.count] if instances_to_delete: plugin.decommission_nodes(cluster, instances_to_delete) # Scaling infrastructure cluster = g.change_cluster_status(cluster, "Scaling") instances = INFRA.scale_cluster(cluster, node_group_id_map) # Setting up new nodes with the plugin if instances: cluster = g.change_cluster_status(cluster, "Configuring") try: instances = g.get_instances(cluster, instances) plugin.scale_cluster(cluster, instances) except Exception as ex: if not g.check_cluster_exists(cluster): LOG.info(g.format_cluster_deleted_message(cluster)) return LOG.exception( _LE("Can't scale cluster '%(name)s' (reason: %(reason)s)"), {'name': cluster.name, 'reason': ex}) g.change_cluster_status(cluster, "Error") return if not g.check_cluster_exists(cluster): LOG.info(g.format_cluster_deleted_message(cluster)) return g.change_cluster_status(cluster, "Active")
def test_delete_floating_ips(self):
    """Shutting down instances must delete their floating IPs and the
    servers themselves (two of each for a two-node group)."""
    node_groups = [_make_ng_dict("test_group_1", "test_flavor",
                                 ["data node", "test tracker"], 2,
                                 'pool')]

    ctx = context.ctx()
    cluster = _create_cluster_mock(node_groups, ["datanode"])
    self.engine._create_instances(cluster)

    cluster = conductor.cluster_get(ctx, cluster)
    instances_list = g.get_instances(cluster)

    self.engine._assign_floating_ips(instances_list)

    self.engine._shutdown_instances(cluster)
    # FIX: assertEqual takes (expected, actual) — the arguments were
    # reversed, which produces misleading failure messages.
    self.assertEqual(2, self.nova.floating_ips.delete.call_count,
                     "Not expected floating IPs number found in delete")
    self.assertEqual(2, self.nova.servers.delete.call_count,
                     "Not expected")
def create_cluster(self, cluster):
    """Provision all instances of *cluster* and prepare them for use.

    Drives the cluster through Spawning -> Waiting -> Preparing.  On
    failure the cluster is marked "Error" with the failure reason and
    creation is rolled back; the exception is re-raised via
    save_and_reraise_exception().
    """
    ctx = context.ctx()
    try:
        # create all instances
        conductor.cluster_update(ctx, cluster, {"status": "Spawning"})
        LOG.info(g.format_cluster_status(cluster))
        self._create_instances(cluster)

        # wait for all instances are up and networks ready
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Waiting"})
        LOG.info(g.format_cluster_status(cluster))

        instances = g.get_instances(cluster)

        self._await_active(cluster, instances)

        self._assign_floating_ips(instances)

        self._await_networks(cluster, instances)

        # Re-read: instance/network data was updated in the DB above.
        cluster = conductor.cluster_get(ctx, cluster)

        # attach volumes
        volumes.attach(cluster)

        # prepare all instances
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Preparing"})
        LOG.info(g.format_cluster_status(cluster))

        self._configure_instances(cluster)
    except Exception as ex:
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't start cluster '%s' (reason: %s)", cluster, ex)

            cluster = conductor.cluster_update(
                ctx, cluster, {
                    "status": "Error",
                    "status_description": str(ex)
                })
            LOG.info(g.format_cluster_status(cluster))
            self._rollback_cluster_creation(cluster, ex)
def rollback_cluster(self, cluster, reason):
    """Undo a failed create or scale operation.

    :param reason: exception that triggered the rollback
    :returns: True when a scaling rollback ran, False otherwise
    """
    rollback_info = cluster.rollback_info or {}
    self._update_rollback_strategy(cluster)

    if rollback_info.get('shutdown', False):
        # Creation failed: tear the whole cluster down.
        self._rollback_cluster_creation(cluster, reason)
        LOG.warning(_LW("Cluster creation rollback "
                        "(reason: {reason})").format(reason=reason))

        return False

    instance_ids = rollback_info.get('instance_ids', [])
    if instance_ids:
        # Scaling failed: remove only the instances added by the scale.
        self._rollback_cluster_scaling(
            cluster, g.get_instances(cluster, instance_ids), reason)
        LOG.warning(_LW("Cluster scaling rollback "
                        "(reason: {reason})").format(reason=reason))

        return True

    return False
def rollback_cluster(self, cluster, reason):
    """Undo a failed create or scale operation.

    :param reason: exception that triggered the rollback
    :returns: True when a scaling rollback ran, False otherwise
    """
    rollback_info = cluster.rollback_info or {}
    self._update_rollback_strategy(cluster)

    if rollback_info.get('shutdown', False):
        # Creation failed: tear the whole cluster down.
        self._rollback_cluster_creation(cluster, reason)
        LOG.warning(
            _LW("Cluster creation rollback "
                "(reason: {reason})").format(reason=reason))

        return False

    instance_ids = rollback_info.get('instance_ids', [])
    if instance_ids:
        # Scaling failed: remove only the instances added by the scale.
        self._rollback_cluster_scaling(
            cluster, g.get_instances(cluster, instance_ids), reason)
        LOG.warning(
            _LW("Cluster scaling rollback "
                "(reason: {reason})").format(reason=reason))

        return True

    return False
def test_ip_assignment_use_no_floating(self):
    """With use_floating_ips disabled, floating IPs are only created
    for node groups that declare a floating_ip_pool."""
    self.override_config("use_floating_ips", False)

    node_groups = [_make_ng_dict("test_group_1", "test_flavor",
                                 ["data node", "test tracker"], 2,
                                 'pool'),
                   _make_ng_dict("test_group_2", "test_flavor",
                                 ["name node", "test tracker"], 1)]

    ctx = context.ctx()
    cluster = _create_cluster_mock(node_groups, ["data node"])
    self.engine._create_instances(cluster)

    cluster = conductor.cluster_get(ctx, cluster)
    instances_list = g.get_instances(cluster)

    self.engine._assign_floating_ips(instances_list)

    self.nova.floating_ips.create.assert_has_calls(
        [mock.call("pool"), mock.call("pool")])

    # FIX: assertEqual takes (expected, actual) — the arguments were
    # reversed, which produces misleading failure messages.
    self.assertEqual(2, self.nova.floating_ips.create.call_count,
                     "Not expected floating IPs number found.")
def _provision_scaled_cluster(cluster_id, node_group_id_map):
    """Resize a running cluster to the counts in *node_group_id_map*.

    Decommissions surplus nodes via the plugin, scales the
    infrastructure, configures new nodes (including NTP), and marks
    the cluster "Active".  Always deletes the Keystone trust when
    identity API v3 is in use and the cluster is not transient.
    """
    ctx, cluster, plugin = _prepare_provisioning(cluster_id)

    try:
        # Decommissioning surplus nodes with the plugin
        cluster = g.change_cluster_status(cluster, "Decommissioning")

        instances_to_delete = []

        for node_group in cluster.node_groups:
            new_count = node_group_id_map[node_group.id]
            if new_count < node_group.count:
                # Everything past the new count is surplus.
                instances_to_delete += node_group.instances[new_count:
                                                            node_group.count]

        if instances_to_delete:
            context.set_step_type(_("Plugin: decommission cluster"))
            plugin.decommission_nodes(cluster, instances_to_delete)

        # Scaling infrastructure
        cluster = g.change_cluster_status(cluster, "Scaling")
        context.set_step_type(_("Engine: scale cluster"))
        instance_ids = INFRA.scale_cluster(cluster, node_group_id_map)

        # Setting up new nodes with the plugin
        if instance_ids:
            ntp_service.configure_ntp(cluster_id)
            cluster = g.change_cluster_status(cluster, "Configuring")
            instances = g.get_instances(cluster, instance_ids)
            context.set_step_type(_("Plugin: scale cluster"))
            plugin.scale_cluster(cluster, instances)

        g.change_cluster_status(cluster, "Active")

    finally:
        # Trust cleanup must happen whether or not scaling succeeded.
        if CONF.use_identity_api_v3 and not cluster.is_transient:
            trusts.delete_trust_from_cluster(cluster)
def _provision_scaled_cluster(cluster_id, node_group_id_map):
    """Resize a running cluster to the counts in *node_group_id_map*.

    Decommissions surplus nodes via the plugin, scales the
    infrastructure, configures new nodes (including NTP), and marks
    the cluster "Active".  Always deletes the Keystone trust when
    identity API v3 is in use and the cluster is not transient.
    """
    ctx, cluster, plugin = _prepare_provisioning(cluster_id)

    try:
        # Decommissioning surplus nodes with the plugin
        cluster = g.change_cluster_status(cluster, "Decommissioning")

        instances_to_delete = []

        for node_group in cluster.node_groups:
            new_count = node_group_id_map[node_group.id]
            if new_count < node_group.count:
                # Everything past the new count is surplus.
                instances_to_delete += node_group.instances[
                    new_count:node_group.count]

        if instances_to_delete:
            context.set_step_type(_("Plugin: decommission cluster"))
            plugin.decommission_nodes(cluster, instances_to_delete)

        # Scaling infrastructure
        cluster = g.change_cluster_status(cluster, "Scaling")
        context.set_step_type(_("Engine: scale cluster"))
        instance_ids = INFRA.scale_cluster(cluster, node_group_id_map)

        # Setting up new nodes with the plugin
        if instance_ids:
            ntp_service.configure_ntp(cluster_id)
            cluster = g.change_cluster_status(cluster, "Configuring")
            instances = g.get_instances(cluster, instance_ids)
            context.set_step_type(_("Plugin: scale cluster"))
            plugin.scale_cluster(cluster, instances)

        g.change_cluster_status(cluster, "Active")

    finally:
        # Trust cleanup must happen whether or not scaling succeeded.
        if CONF.use_identity_api_v3 and not cluster.is_transient:
            trusts.delete_trust_from_cluster(cluster)
def _remove_db_objects(self, cluster):
    """Delete every instance record of *cluster* from the database."""
    ctx = context.ctx()
    # Refresh the cluster first so the instance list is current.
    fresh_cluster = conductor.cluster_get(ctx, cluster)
    for inst in g.get_instances(fresh_cluster):
        conductor.instance_remove(ctx, inst)