def _grow_cluster():
    """Add new members to an existing Redis cluster.

    Picks any pre-existing instance as the cluster head, waits for the
    new members to reach cluster-ready status, then has every new node
    meet the head and finalize its membership.
    """
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    # BUG FIX: a bare next() on an empty generator raises StopIteration,
    # so the "if not cluster_head" guard below could never fire.  Supply
    # None as the default so the guard works as intended.
    cluster_head = next(
        (Instance.load(context, db_inst.id)
         for db_inst in db_instances
         if db_inst.id not in new_instance_ids), None)
    if not cluster_head:
        raise TroveError(_("Unable to determine existing Redis cluster"
                           " member"))
    (cluster_head_ip, cluster_head_port) = (
        self.get_guest(cluster_head).get_node_ip())

    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return

    LOG.debug("All members ready, proceeding for cluster setup.")
    new_insts = [Instance.load(context, instance_id)
                 for instance_id in new_instance_ids]
    # BUG FIX: map() returns a one-shot iterator in Python 3; the
    # original iterated it twice, so the cluster_complete() loop below
    # ran zero times.  Materialize the guests into a list instead.
    new_guests = [self.get_guest(inst) for inst in new_insts]

    # Connect nodes to the cluster head.
    for guest in new_guests:
        guest.cluster_meet(cluster_head_ip, cluster_head_port)

    # Finalize membership on every new node.
    for guest in new_guests:
        guest.cluster_complete()
def _add_shard_cluster():
    # Add a new shard to a MongoDB cluster: collect the shard's member
    # instances, wait for them to become cluster-ready, register the
    # shard through a query router, then mark each member complete.
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False,
                                       shard_id=shard_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    LOG.debug("instances in shard %(shard_id)s: %(instance_ids)s",
              {'shard_id': shard_id,
               'instance_ids': instance_ids})
    # Bail out if any member failed to reach cluster-ready status; the
    # helper is responsible for updating statuses on failure.
    if not self._all_instances_ready(instance_ids, cluster_id,
                                     shard_id):
        return
    members = [Instance.load(context, instance_id)
               for instance_id in instance_ids]
    db_query_routers = DBInstance.find_all(cluster_id=cluster_id,
                                           type='query_router',
                                           deleted=False).all()
    query_routers = [Instance.load(context, db_query_router.id)
                     for db_query_router in db_query_routers]
    # NOTE(review): assumes at least one query router exists —
    # query_routers[0] raises IndexError otherwise; confirm callers
    # guarantee this.
    if not self._create_shard(query_routers[0], members):
        return
    # Notify every member that cluster setup has finished.
    for member in members:
        self.get_guest(member).cluster_complete()
def _grow_cluster():
    """Join new members into an existing Redis cluster via a head node.

    Waits for the new members to reach cluster-ready status, then has
    each new node meet the cluster head and finalize its membership.
    """
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    # BUG FIX: next() without a default raises StopIteration on an
    # exhausted generator, making the "if not cluster_head" check below
    # unreachable.  Pass None as the default.
    cluster_head = next((Instance.load(context, db_inst.id)
                         for db_inst in db_instances
                         if db_inst.id not in new_instance_ids), None)
    if not cluster_head:
        raise TroveError("Unable to determine existing Redis cluster "
                         "member")
    (cluster_head_ip, cluster_head_port) = (
        self.get_guest(cluster_head).get_node_ip())

    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return

    LOG.debug("All members ready, proceeding for cluster setup.")
    new_insts = [Instance.load(context, instance_id)
                 for instance_id in new_instance_ids]
    # BUG FIX: map() yields a single-use iterator in Python 3, so the
    # second loop below would see an exhausted iterator and never call
    # cluster_complete().  Build a list instead.
    new_guests = [self.get_guest(inst) for inst in new_insts]

    # Connect nodes to the cluster head.
    for guest in new_guests:
        guest.cluster_meet(cluster_head_ip, cluster_head_port)

    for guest in new_guests:
        guest.cluster_complete()
def _add_shard_cluster():
    """Add a shard to a MongoDB cluster (replica-set flavor).

    Waits for the shard members, forms them into a replica set,
    registers the shard on the query routers, then finalizes each
    member.
    """
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       shard_id=shard_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Pass log arguments lazily instead of eager %-formatting so
    # interpolation is skipped when debug logging is disabled.
    LOG.debug("instances in shard %s: %s", shard_id, instance_ids)
    if not self._all_instances_ready(instance_ids, cluster_id,
                                     shard_id):
        return
    members = [Instance.load(context, instance_id)
               for instance_id in instance_ids]
    if not self._create_replica_set(members, cluster_id, shard_id):
        return
    db_query_routers = DBInstance.find_all(cluster_id=cluster_id,
                                           type='query_router',
                                           deleted=False).all()
    query_routers = [Instance.load(context, db_query_router.id)
                     for db_query_router in db_query_routers]
    # NOTE(review): replica_set_name is not defined in this function —
    # presumably bound in the enclosing scope; confirm before relying
    # on it.
    if not self._create_shard(query_routers, replica_set_name,
                              members, cluster_id, shard_id):
        return
    for member in members:
        self.get_guest(member).cluster_complete()
def _shrink_cluster():
    """Remove members from a Vertica cluster.

    Refuses to remove the master, re-marks k-safety for the surviving
    members, shrinks the cluster via the master's guest agent, then
    deletes the removed instances.
    """
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    all_instance_ids = [db_instance.id for db_instance in db_instances]
    remove_instances = [Instance.load(context, instance_id)
                        for instance_id in instance_ids]
    left_instances = [Instance.load(context, instance_id)
                      for instance_id in all_instance_ids
                      if instance_id not in instance_ids]
    remove_member_ips = [self.get_ip(instance)
                         for instance in remove_instances]
    k = VerticaCluster.k_safety(len(left_instances))
    for db_instance in db_instances:
        if db_instance['type'] == 'master':
            master_instance = Instance.load(context, db_instance.id)
            # The master itself must survive the shrink.
            if self.get_ip(master_instance) in remove_member_ips:
                raise RuntimeError(_("Cannot remove master instance!"))
            # Debug messages are not translated, and arguments are
            # passed lazily instead of eager %-formatting (the original
            # did `LOG.debug(_("...") % k)`).
            LOG.debug("Marking cluster k-safety: %s", k)
            self.get_guest(master_instance).mark_design_ksafe(k)
            self.get_guest(master_instance).shrink_cluster(
                remove_member_ips)
            break
    for r in remove_instances:
        Instance.delete(r)
def _add_shard_cluster():
    """Add a shard to a MongoDB cluster and finalize its members."""
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       shard_id=shard_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Pass log arguments lazily (named placeholders) instead of eager
    # %-formatting, so interpolation is skipped when debug logging is
    # disabled.
    LOG.debug("instances in shard %(shard_id)s: %(instance_ids)s",
              {'shard_id': shard_id, 'instance_ids': instance_ids})
    if not self._all_instances_ready(instance_ids, cluster_id,
                                     shard_id):
        return
    members = [Instance.load(context, instance_id)
               for instance_id in instance_ids]
    db_query_routers = DBInstance.find_all(cluster_id=cluster_id,
                                           type='query_router',
                                           deleted=False).all()
    query_routers = [Instance.load(context, db_query_router.id)
                     for db_query_router in db_query_routers]
    # NOTE(review): assumes at least one query router exists.
    if not self._create_shard(query_routers[0], members):
        return
    for member in members:
        self.get_guest(member).cluster_complete()
def _shrink_cluster():
    # Remove members from a Vertica cluster.  Computes the new k-safety
    # value for the remaining members, has the master mark it and drop
    # the departing nodes, then deletes the removed instances.
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    all_instance_ids = [db_instance.id for db_instance in db_instances]
    remove_instances = [Instance.load(context, instance_id)
                        for instance_id in instance_ids]
    left_instances = [Instance.load(context, instance_id)
                      for instance_id in all_instance_ids
                      if instance_id not in instance_ids]
    remove_member_ips = [self.get_ip(instance)
                         for instance in remove_instances]
    k = VerticaCluster.k_safety(len(left_instances))
    for db_instance in db_instances:
        if db_instance['type'] == 'master':
            master_instance = Instance.load(context, db_instance.id)
            # The master must survive the shrink; refuse otherwise.
            if self.get_ip(master_instance) in remove_member_ips:
                raise RuntimeError(_("Cannot remove master instance!"))
            LOG.debug("Marking cluster k-safety: %s", k)
            self.get_guest(master_instance).mark_design_ksafe(k)
            self.get_guest(master_instance).shrink_cluster(
                remove_member_ips)
            # Only one master is expected; stop scanning.
            break
    for r in remove_instances:
        Instance.delete(r)
def _grow_cluster():
    # Grow a MongoDB cluster by adding new shard members and/or new
    # query routers.  Every successfully added instance is finalized
    # via cluster_complete() at the end.
    new_instances = [db_instance for db_instance in self.db_instances
                     if db_instance.id in instance_ids]
    new_members = [db_instance for db_instance in new_instances
                   if db_instance.type == 'member']
    new_query_routers = [db_instance for db_instance in new_instances
                         if db_instance.type == 'query_router']
    instances = []
    if new_members:
        # Group the new members by shard and register each shard
        # through one running query router.
        shard_ids = set([db_instance.shard_id
                         for db_instance in new_members])
        query_router_id = self._get_running_query_router_id()
        if not query_router_id:
            return
        for shard_id in shard_ids:
            LOG.debug('growing cluster by adding shard %(shard_id)s '
                      'on query router %(router_id)s',
                      {'shard_id': shard_id,
                       'router_id': query_router_id})
            member_ids = [db_instance.id for db_instance in new_members
                          if db_instance.shard_id == shard_id]
            if not self._all_instances_ready(
                member_ids, cluster_id, shard_id
            ):
                return
            members = [Instance.load(context, member_id)
                       for member_id in member_ids]
            query_router = Instance.load(context, query_router_id)
            if not self._create_shard(query_router, members):
                return
            instances.extend(members)
    if new_query_routers:
        # New query routers need the existing config servers' addresses
        # (and the cluster admin password) before they can join.
        query_router_ids = [db_instance.id
                            for db_instance in new_query_routers]
        config_servers_ids = [db_instance.id
                              for db_instance in self.db_instances
                              if db_instance.type == 'config_server']
        LOG.debug('growing cluster by adding query routers '
                  '%(router)s, with config servers %(server)s',
                  {'router': query_router_ids,
                   'server': config_servers_ids})
        if not self._all_instances_ready(
            query_router_ids, cluster_id
        ):
            return
        query_routers = [Instance.load(context, instance_id)
                         for instance_id in query_router_ids]
        config_servers_ips = [
            self.get_ip(Instance.load(context, config_server_id))
            for config_server_id in config_servers_ids
        ]
        if not self._add_query_routers(
                query_routers, config_servers_ips,
                admin_password=self.get_cluster_admin_password(context)
        ):
            return
        instances.extend(query_routers)
    # Finalize every instance that was successfully added.
    for instance in instances:
        self.get_guest(instance).cluster_complete()
def _grow_cluster():
    """Grow a MongoDB cluster with new shard members and query routers.

    New members are grouped by shard and registered through a running
    query router; new query routers are pointed at the existing config
    servers.  Every added instance is finalized at the end.
    """
    new_instances = [db_instance for db_instance in self.db_instances
                     if db_instance.id in instance_ids]
    new_members = [db_instance for db_instance in new_instances
                   if db_instance.type == 'member']
    new_query_routers = [db_instance for db_instance in new_instances
                         if db_instance.type == 'query_router']
    instances = []
    if new_members:
        shard_ids = set([db_instance.shard_id
                         for db_instance in new_members])
        query_router_id = self._get_running_query_router_id()
        if not query_router_id:
            return
        for shard_id in shard_ids:
            # Lazy log arguments instead of eager %-formatting: the
            # string is only interpolated when debug logging is on.
            LOG.debug('growing cluster by adding shard %(shard_id)s '
                      'on query router %(router_id)s',
                      {'shard_id': shard_id,
                       'router_id': query_router_id})
            member_ids = [db_instance.id for db_instance in new_members
                          if db_instance.shard_id == shard_id]
            if not self._all_instances_ready(
                member_ids, cluster_id, shard_id
            ):
                return
            members = [Instance.load(context, member_id)
                       for member_id in member_ids]
            query_router = Instance.load(context, query_router_id)
            if not self._create_shard(query_router, members):
                return
            instances.extend(members)
    if new_query_routers:
        query_router_ids = [db_instance.id
                            for db_instance in new_query_routers]
        config_servers_ids = [db_instance.id
                              for db_instance in self.db_instances
                              if db_instance.type == 'config_server']
        # Same lazy-logging fix as above.
        LOG.debug('growing cluster by adding query routers '
                  '%(router)s, with config servers %(server)s',
                  {'router': query_router_ids,
                   'server': config_servers_ids})
        if not self._all_instances_ready(
            query_router_ids, cluster_id
        ):
            return
        query_routers = [Instance.load(context, instance_id)
                         for instance_id in query_router_ids]
        config_servers_ips = [
            self.get_ip(Instance.load(context, config_server_id))
            for config_server_id in config_servers_ids
        ]
        if not self._add_query_routers(
                query_routers, config_servers_ips,
                admin_password=self.get_cluster_admin_password(context)
        ):
            return
        instances.extend(query_routers)
    for instance in instances:
        self.get_guest(instance).cluster_complete()
def _shrink_cluster():
    # Shrink a Galera cluster: delete the removal instances, wait until
    # the database marks them deleted, then push a regenerated cluster
    # configuration to every surviving member.
    removal_instances = [
        Instance.load(context, instance_id)
        for instance_id in removal_instance_ids
    ]
    for instance in removal_instances:
        Instance.delete(instance)

    # wait for instances to be deleted
    def all_instances_marked_deleted():
        # True once no removal id remains among non-deleted rows.
        non_deleted_instances = DBInstance.find_all(
            cluster_id=cluster_id, deleted=False).all()
        non_deleted_ids = [
            db_instance.id for db_instance in non_deleted_instances
        ]
        return not bool(
            set(removal_instance_ids).intersection(
                set(non_deleted_ids)))
    try:
        LOG.info("Deleting instances (%s)", removal_instance_ids)
        utils.poll_until(all_instances_marked_deleted,
                         sleep_time=2,
                         time_out=CONF.cluster_delete_time_out)
    except PollTimeOut:
        # Give up rather than reconfigure a cluster in an unknown state.
        LOG.error("timeout for instances to be marked as deleted.")
        return

    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    leftover_instances = [
        Instance.load(context, db_inst.id)
        for db_inst in db_instances
        if db_inst.id not in removal_instance_ids
    ]
    leftover_cluster_ips = [
        self.get_ip(instance) for instance in leftover_instances
    ]

    # Get config changes for left over instances
    rnd_cluster_guest = self.get_guest(leftover_instances[0])
    cluster_context = rnd_cluster_guest.get_cluster_context()

    # apply the new config to all leftover instances
    for instance in leftover_instances:
        guest = self.get_guest(instance)
        # render the conf.d/cluster.cnf configuration
        cluster_configuration = self._render_cluster_config(
            context,
            instance,
            ",".join(leftover_cluster_ips),
            cluster_context['cluster_name'],
            cluster_context['replication_user'])
        guest.write_cluster_configuration_overrides(
            cluster_configuration)
def _create_cluster():
    """Set up a new Vertica cluster once all members are ready.

    Configures password-less SSH between members, installs the cluster
    from the master instance, then finalizes every member.
    """
    # Fetch instances by cluster_id against instances table.
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(instance_ids, cluster_id):
        return
    LOG.debug("All members ready, proceeding for cluster setup.")
    instances = [
        Instance.load(context, instance_id)
        for instance_id in instance_ids
    ]
    member_ips = [self.get_ip(instance) for instance in instances]
    guests = [self.get_guest(instance) for instance in instances]

    # Users to be configured for password-less SSH.
    authorized_users_without_password = ['root', 'dbadmin']

    # Configuring password-less SSH for cluster members.
    # Strategy for setting up SSH:
    # get public keys for user from member-instances in cluster,
    # combine them, finally push it back to all instances,
    # and member instances add them to authorized keys.
    LOG.debug("Configuring password-less SSH on cluster members.")
    try:
        for user in authorized_users_without_password:
            pub_key = [guest.get_public_keys(user) for guest in guests]
            for guest in guests:
                guest.authorize_public_keys(user, pub_key)

        # Lazy log argument instead of eager %-formatting.
        LOG.debug("Installing cluster with members: %s.", member_ips)
        for db_instance in db_instances:
            if db_instance['type'] == 'master':
                master_instance = Instance.load(
                    context, db_instance.id)
                self.get_guest(master_instance).install_cluster(
                    member_ips)
                break

        LOG.debug("Finalizing cluster configuration.")
        for guest in guests:
            guest.cluster_complete()
    except Exception:
        LOG.exception(_("Error creating cluster."))
        self.update_statuses_on_failure(cluster_id)
def _create_cluster():
    """Bootstrap a Vertica cluster from its ready members.

    Sets up password-less SSH across all members, installs the cluster
    via the master's guest agent, and finalizes each member.
    """
    # Fetch instances by cluster_id against instances table.
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(instance_ids, cluster_id):
        return
    LOG.debug("All members ready, proceeding for cluster setup.")
    instances = [Instance.load(context, instance_id)
                 for instance_id in instance_ids]
    member_ips = [self.get_ip(instance) for instance in instances]
    guests = [self.get_guest(instance) for instance in instances]

    # Users to be configured for password-less SSH.
    authorized_users_without_password = ['root', 'dbadmin']

    # Configuring password-less SSH for cluster members.
    # Strategy for setting up SSH:
    # get public keys for user from member-instances in cluster,
    # combine them, finally push it back to all instances,
    # and member instances add them to authorized keys.
    LOG.debug("Configuring password-less SSH on cluster members.")
    try:
        for user in authorized_users_without_password:
            pub_key = [guest.get_public_keys(user) for guest in guests]
            for guest in guests:
                guest.authorize_public_keys(user, pub_key)

        # Lazy log argument instead of eager %-formatting.
        LOG.debug("Installing cluster with members: %s.", member_ips)
        for db_instance in db_instances:
            if db_instance['type'] == 'master':
                master_instance = Instance.load(context,
                                                db_instance.id)
                self.get_guest(master_instance).install_cluster(
                    member_ips)
                break

        LOG.debug("Finalizing cluster configuration.")
        for guest in guests:
            guest.cluster_complete()
    except Exception:
        LOG.exception(_("Error creating cluster."))
        self.update_statuses_on_failure(cluster_id)
def _shrink_cluster():
    """Shrink a Galera cluster and re-sync the surviving members' config.

    Deletes the removal instances, polls until the database marks them
    deleted, then pushes a regenerated cluster configuration to every
    leftover member.
    """
    removal_instances = [Instance.load(context, instance_id)
                         for instance_id in removal_instance_ids]
    for instance in removal_instances:
        Instance.delete(instance)

    # wait for instances to be deleted
    def all_instances_marked_deleted():
        # True once no removal id remains among non-deleted rows.
        non_deleted_instances = DBInstance.find_all(
            cluster_id=cluster_id, deleted=False).all()
        non_deleted_ids = [db_instance.id for db_instance
                           in non_deleted_instances]
        return not bool(
            set(removal_instance_ids).intersection(
                set(non_deleted_ids))
        )
    try:
        # Lazy log argument instead of eager %-formatting.
        LOG.info(_("Deleting instances (%s)"), removal_instance_ids)
        utils.poll_until(all_instances_marked_deleted,
                         sleep_time=2,
                         time_out=CONF.cluster_delete_time_out)
    except PollTimeOut:
        LOG.error(_("timeout for instances to be marked as deleted."))
        return

    # BUG FIX: filter out deleted rows here.  Without deleted=False
    # this query also returned soft-deleted instances (e.g. from prior
    # operations) and tried to load them as leftover members.
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    leftover_instances = [Instance.load(context, db_inst.id)
                          for db_inst in db_instances
                          if db_inst.id not in removal_instance_ids]
    leftover_cluster_ips = [self.get_ip(instance) for instance
                            in leftover_instances]

    # Get config changes for left over instances
    rnd_cluster_guest = self.get_guest(leftover_instances[0])
    cluster_context = rnd_cluster_guest.get_cluster_context()

    # apply the new config to all leftover instances
    for instance in leftover_instances:
        guest = self.get_guest(instance)
        # render the conf.d/cluster.cnf configuration
        cluster_configuration = self._render_cluster_config(
            context, instance, ",".join(leftover_cluster_ips),
            cluster_context['cluster_name'],
            cluster_context['replication_user'])
        guest.write_cluster_configuration_overrides(
            cluster_configuration)
def _grow_cluster():
    # Grow a Couchbase-style cluster: wait for the new nodes, then ask
    # an existing node to add and rebalance them.  Statuses are updated
    # on failure.
    # Wait for new nodes to get to cluster-ready status.
    LOG.debug("Waiting for new nodes to become ready.")
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return

    new_instances = [Instance.load(context, instance_id)
                     for instance_id in new_instance_ids]
    added_nodes = [self.build_node_info(instance)
                   for instance in new_instances]

    LOG.debug("All nodes ready, proceeding with cluster setup.")

    cluster_node_ids = self.find_cluster_node_ids(cluster_id)
    cluster_nodes = self.load_cluster_nodes(context, cluster_node_ids)

    # Rebalance the cluster via one of the existing nodes.
    # Clients can continue to store and retrieve information and
    # do not need to be aware that a rebalance operation is taking
    # place.
    # The new nodes are marked active only if the rebalancing
    # completes.
    try:
        coordinator = cluster_nodes[0]
        self._add_nodes(coordinator, added_nodes)
        LOG.debug("Cluster configuration finished successfully.")
    except Exception:
        LOG.exception(_("Error growing cluster."))
        self.update_statuses_on_failure(cluster_id)
def shrink(self, instances):
    """Remove the given instances from this cluster.

    Validates that the cluster stays non-empty, flags the cluster as
    shrinking, and hands the work off to the task manager.  On dispatch
    failure the task status is rolled back and the error re-raised.
    """
    LOG.debug("Shrinking cluster %s.", self.id)

    self.validate_cluster_available()
    removal_instances = [Instance.load(self.context, inst_id)
                         for inst_id in instances]
    db_instances = DBInstance.find_all(
        cluster_id=self.db_info.id, deleted=False).all()
    # At least one member must remain after the shrink.
    if len(db_instances) - len(removal_instances) < 1:
        raise exception.ClusterShrinkMustNotLeaveClusterEmpty()

    self.db_info.update(task_status=ClusterTasks.SHRINKING_CLUSTER)
    try:
        removal_ids = [instance.id for instance in removal_instances]
        task_api.load(
            self.context,
            self.ds_version.manager).shrink_cluster(
                self.db_info.id, removal_ids)
    except Exception:
        # Roll the task status back so the cluster is not stuck.
        self.db_info.update(task_status=ClusterTasks.NONE)
        raise

    return self.__class__(self.context, self.db_info,
                          self.ds, self.ds_version)
def _grow_cluster():
    """Join newly-provisioned nodes into the cluster and rebalance."""
    # Wait for new nodes to get to cluster-ready status.
    LOG.debug("Waiting for new nodes to become ready.")
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return

    added_instances = [Instance.load(context, inst_id)
                       for inst_id in new_instance_ids]
    add_node_info = [self.build_node_info(inst)
                     for inst in added_instances]

    LOG.debug("All nodes ready, proceeding with cluster setup.")

    node_ids = self.find_cluster_node_ids(cluster_id)
    all_nodes = self.load_cluster_nodes(context, node_ids)
    old_node_info = [node for node in all_nodes
                     if node['id'] not in new_instance_ids]

    # Rebalance through an arbitrary pre-existing node.  Clients can
    # keep storing and retrieving data while the rebalance runs.
    coordinator = old_node_info[0]
    self._add_nodes(coordinator, add_node_info)
    LOG.debug("Cluster grow finished successfully.")
def _create_cluster():
    """Partition TiDB cluster instances by role once all are ready."""
    # fetch instances by cluster_id against instances table
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    LOG.debug("instances in cluster %(cluster_id)s: %(instance_ids)s",
              {'cluster_id': cluster_id,
               'instance_ids': instance_ids})
    if not self._all_instances_ready(instance_ids, cluster_id):
        return
    LOG.debug("all instances in cluster %s ready.", cluster_id)
    instances = [Instance.load(context, instance_id)
                 for instance_id in instance_ids]

    # filter tidb_server in instances into a new list: tidb_server
    tidb_server = [instance for instance in instances
                   if instance.type == 'tidb_server']
    # BUG FIX: the log statement referenced `query_routers`, a name
    # that does not exist in this function (NameError at runtime);
    # it should log the tidb_server ids.
    LOG.debug("tidb_server: %s",
              [instance.id for instance in tidb_server])

    # filter pd_server in instances into new list: pd_server
    pd_server = [instance for instance in instances
                 if instance.type == 'pd_server']
    LOG.debug("pd_server: %s",
              [instance.id for instance in pd_server])

    # filter tikv into a new list: tikv
    tikv = [instance for instance in instances
            if instance.type == 'tikv']
    LOG.debug("tikv: %s", [instance.id for instance in tikv])
def _create_resources():
    # Create the backup DB record and dispatch the backup task to the
    # guest via the task API.  Returns the created DBBackup record.
    # parse the ID from the Ref
    instance_id = utils.get_id_from_href(instance)

    # verify that the instance exists and can perform actions
    from trove.instance.models import Instance
    instance_model = Instance.load(context, instance_id)
    instance_model.validate_can_perform_action()

    cls.verify_swift_auth_token(context)

    try:
        db_info = DBBackup.create(name=name,
                                  description=description,
                                  tenant_id=context.tenant,
                                  state=BackupState.NEW,
                                  instance_id=instance_id,
                                  deleted=False)
    except exception.InvalidModelError as ex:
        LOG.exception("Unable to create Backup record:")
        raise exception.BackupCreationError(str(ex))

    backup_info = {'id': db_info.id,
                   'name': name,
                   'description': description,
                   'instance_id': instance_id,
                   'backup_type': db_info.backup_type,
                   'checksum': db_info.checksum,
                   }
    # Hand the actual backup work to the task layer.
    api.API(context).create_backup(backup_info, instance_id)
    return db_info
def _grow_cluster():
    """Add new nodes to the cluster and rebalance via an existing node."""
    # Block until every new node reports cluster-ready.
    LOG.debug("Waiting for new nodes to become ready.")
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return

    added_nodes = [
        self.build_node_info(Instance.load(context, inst_id))
        for inst_id in new_instance_ids
    ]

    LOG.debug("All nodes ready, proceeding with cluster setup.")

    cluster_node_ids = self.find_cluster_node_ids(cluster_id)
    cluster_nodes = self.load_cluster_nodes(context, cluster_node_ids)

    # Rebalance the cluster via one of the existing nodes.  Clients can
    # continue to store and retrieve information during the rebalance.
    # The new nodes are marked active only if the rebalancing completes.
    try:
        self._add_nodes(cluster_nodes[0], added_nodes)
        LOG.debug("Cluster configuration finished successfully.")
    except Exception:
        LOG.exception(_("Error growing cluster."))
        self.update_statuses_on_failure(cluster_id)
def _create_cluster():
    """Assemble a sharded MongoDB cluster (replica-set flavor).

    Waits for all instances, points the query routers at the config
    servers, forms the members into a replica set, registers the shard,
    then finalizes every instance.
    """
    # fetch instances by cluster_id against instances table
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Lazy log arguments throughout instead of eager %-formatting, so
    # interpolation only happens when debug logging is enabled.
    LOG.debug("instances in cluster %s: %s", cluster_id, instance_ids)
    if not self._all_instances_ready(instance_ids, cluster_id):
        return

    instances = [Instance.load(context, instance_id)
                 for instance_id in instance_ids]

    # filter query routers in instances into a new list: query_routers
    query_routers = [instance for instance in instances
                     if instance.type == 'query_router']
    LOG.debug("query routers: %s",
              [instance.id for instance in query_routers])

    # filter config servers in instances into new list: config_servers
    config_servers = [instance for instance in instances
                      if instance.type == 'config_server']
    LOG.debug("config servers: %s",
              [instance.id for instance in config_servers])

    # filter members (non router/configsvr) into a new list: members
    members = [instance for instance in instances
               if instance.type == 'member']
    LOG.debug("members: %s", [instance.id for instance in members])

    # for config_server in config_servers, append ip/hostname to
    # "config_server_hosts", then
    # peel off the replica-set name and ip/hostname from 'x'
    config_server_ips = [self.get_ip(instance)
                         for instance in config_servers]
    LOG.debug("config server ips: %s", config_server_ips)

    LOG.debug("calling add_config_servers on query_routers")
    try:
        for query_router in query_routers:
            (self.get_guest(query_router)
             .add_config_servers(config_server_ips))
    except Exception:
        LOG.exception(_("error adding config servers"))
        self.update_statuses_on_failure(cluster_id)
        return

    if not self._create_replica_set(members, cluster_id):
        return

    replica_set_name = "rs1"
    if not self._create_shard(query_routers, replica_set_name,
                              members, cluster_id):
        return
    # call to start checking status
    for instance in instances:
        self.get_guest(instance).cluster_complete()
def _grow_cluster():
    """Grow a Vertica cluster with new members.

    Re-establishes password-less SSH across all (old and new) members,
    has the master's guest agent grow the cluster, then finalizes the
    new members.
    """
    # Lazy log argument instead of eager %-formatting.
    LOG.debug("begin grow_cluster for Vertica cluster %s", cluster_id)

    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    instance_ids = [db_instance.id for db_instance in db_instances]

    # Wait for new cluster members to get to cluster-ready status.
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return

    new_insts = [
        Instance.load(context, instance_id)
        for instance_id in new_instance_ids
    ]
    existing_instances = [
        Instance.load(context, instance_id)
        for instance_id in instance_ids
        if instance_id not in new_instance_ids
    ]
    existing_guests = [self.get_guest(i) for i in existing_instances]
    new_guests = [self.get_guest(i) for i in new_insts]
    all_guests = new_guests + existing_guests

    # Users to be configured for password-less SSH.
    authorized_users_without_password = ['root', 'dbadmin']
    new_ips = [self.get_ip(instance) for instance in new_insts]

    # Re-distribute the combined public keys to every member so old and
    # new nodes can all reach each other.
    for user in authorized_users_without_password:
        pub_key = [guest.get_public_keys(user) for guest in all_guests]
        for guest in all_guests:
            guest.authorize_public_keys(user, pub_key)

    for db_instance in db_instances:
        if db_instance['type'] == 'master':
            LOG.debug("Found 'master' instance, calling grow on guest")
            master_instance = Instance.load(context, db_instance.id)
            self.get_guest(master_instance).grow_cluster(new_ips)
            break

    for guest in new_guests:
        guest.cluster_complete()
def _create_cluster():
    """Assemble a sharded MongoDB cluster from its ready instances.

    Points the query routers at the config servers, registers the
    member instances as a shard, and finalizes every instance.
    """
    # fetch instances by cluster_id against instances table
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Lazy log arguments throughout instead of eager %-formatting, so
    # interpolation only happens when debug logging is enabled.
    LOG.debug("instances in cluster %s: %s", cluster_id, instance_ids)
    if not self._all_instances_ready(instance_ids, cluster_id):
        return
    LOG.debug("all instances in cluster %s ready.", cluster_id)

    instances = [Instance.load(context, instance_id)
                 for instance_id in instance_ids]

    # filter query routers in instances into a new list: query_routers
    query_routers = [instance for instance in instances
                     if instance.type == 'query_router']
    LOG.debug("query routers: %s",
              [instance.id for instance in query_routers])

    # filter config servers in instances into new list: config_servers
    config_servers = [instance for instance in instances
                      if instance.type == 'config_server']
    LOG.debug("config servers: %s",
              [instance.id for instance in config_servers])

    # filter members (non router/configsvr) into a new list: members
    members = [instance for instance in instances
               if instance.type == 'member']
    LOG.debug("members: %s", [instance.id for instance in members])

    # for config_server in config_servers, append ip/hostname to
    # "config_server_hosts", then
    # peel off the replica-set name and ip/hostname from 'x'
    config_server_ips = [self.get_ip(instance)
                         for instance in config_servers]
    LOG.debug("config server ips: %s", config_server_ips)

    if not self._add_query_routers(query_routers, config_server_ips):
        return

    if not self._create_shard(query_routers[0], members):
        return

    # call to start checking status
    for instance in instances:
        self.get_guest(instance).cluster_complete()
def grow_cluster(self, context, cluster_id, new_instance_ids):
    """Grow a K2hdkc Cluster.

    Waits (bounded by CONF.cluster_usage_timeout) for the new instances
    to run, calls cluster_complete on each, and resets the cluster task.
    """
    # Lazy log arguments instead of mixing .format() with a %s
    # placeholder as the original did.
    LOG.debug("Begins grow_cluster for %s. new_instance_ids: %s",
              cluster_id, new_instance_ids)
    # 1. validates args
    if context is None:
        LOG.error("no context")
        return
    if cluster_id is None:
        LOG.error("no cluster_id")
        return
    if new_instance_ids is None:
        LOG.error("no new_instance_ids")
        return
    timeout = Timeout(CONF.cluster_usage_timeout)
    try:
        # 2. Retrieves db_instances from the database
        db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                           deleted=False).all()
        LOG.debug("len(db_instances) %s", len(db_instances))
        # 3. Checks if new instances are ready
        if not self._all_instances_running(new_instance_ids,
                                           cluster_id):
            LOG.error("instances are not ready yet")
            return
        # 4. Loads instances
        instances = [
            Instance.load(context, instance_id)
            for instance_id in new_instance_ids
        ]
        LOG.debug("len(instances) %s", len(instances))
        # 5./6. Calls cluster_complete endpoint of K2hdkcGuestAgent
        LOG.debug("Calling cluster_complete as a final hook to each "
                  "node in the cluster")
        for instance in instances:
            self.get_guest(instance).cluster_complete()
        # 7. reset the current cluster task status to None
        LOG.debug("reset cluster task to None")
        self.reset_task()
    except Timeout as t:
        # BUG FIX: the original compared the Timeout *class* to the
        # timeout *instance* (`if Timeout is not timeout`), which is
        # always true — so even our own timeout was re-raised and the
        # failure handling below was dead code.  Bind the caught
        # exception and compare instances.
        if t is not timeout:
            raise  # not my timeout
        # Note adminstrators should reset task via CLI in this case.
        LOG.exception("Timeout for growing cluster.")
        self.update_statuses_on_failure(
            cluster_id,
            status=inst_tasks.InstanceTasks.GROWING_ERROR)
    finally:
        timeout.cancel()
    LOG.debug("Completed grow_cluster for %s.", cluster_id)
def _grow_cluster():
    """Grow a Vertica cluster: SSH key exchange, master grow, finalize."""
    # Lazy log argument instead of eager %-formatting.
    LOG.debug("begin grow_cluster for Vertica cluster %s", cluster_id)
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Wait for new cluster members to get to cluster-ready status.
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return
    new_insts = [Instance.load(context, instance_id)
                 for instance_id in new_instance_ids]
    existing_instances = [Instance.load(context, instance_id)
                          for instance_id in instance_ids
                          if instance_id not in new_instance_ids]
    existing_guests = [self.get_guest(i) for i in existing_instances]
    new_guests = [self.get_guest(i) for i in new_insts]
    all_guests = new_guests + existing_guests
    # Users to be configured for password-less SSH.
    authorized_users_without_password = ['root', 'dbadmin']
    new_ips = [self.get_ip(instance) for instance in new_insts]
    # Combined public keys are pushed back to every member so old and
    # new nodes can all reach each other.
    for user in authorized_users_without_password:
        pub_key = [guest.get_public_keys(user) for guest in all_guests]
        for guest in all_guests:
            guest.authorize_public_keys(user, pub_key)
    for db_instance in db_instances:
        if db_instance['type'] == 'master':
            LOG.debug("Found 'master' instance, calling grow on guest")
            master_instance = Instance.load(context, db_instance.id)
            self.get_guest(master_instance).grow_cluster(new_ips)
            break
    for guest in new_guests:
        guest.cluster_complete()
def _create_resources():
    # Create the backup DB record (optionally chained to a parent for
    # incremental backups) and dispatch the backup task to the guest.
    # Returns the created DBBackup record.
    # parse the ID from the Ref
    instance_id = utils.get_id_from_href(instance)

    # verify that the instance exists and can perform actions
    from trove.instance.models import Instance
    instance_model = Instance.load(context, instance_id)
    instance_model.validate_can_perform_action()
    cls.validate_can_perform_action(instance_model, 'backup_create')

    cls.verify_swift_auth_token(context)
    # Cluster members cannot be backed up individually.
    if instance_model.cluster_id is not None:
        raise exception.ClusterInstanceOperationNotSupported()

    ds = instance_model.datastore
    ds_version = instance_model.datastore_version
    parent = None
    if parent_id:
        # Look up the parent info or fail early if not found or if
        # the user does not have access to the parent.
        _parent = cls.get_by_id(context, parent_id)
        parent = {
            'location': _parent.location,
            'checksum': _parent.checksum,
        }
    try:
        db_info = DBBackup.create(name=name,
                                  description=description,
                                  tenant_id=context.tenant,
                                  state=BackupState.NEW,
                                  instance_id=instance_id,
                                  parent_id=parent_id,
                                  datastore_version_id=ds_version.id,
                                  deleted=False)
    except exception.InvalidModelError as ex:
        LOG.exception(
            _("Unable to create backup record for "
              "instance: %s"), instance_id)
        raise exception.BackupCreationError(str(ex))

    backup_info = {
        'id': db_info.id,
        'name': name,
        'description': description,
        'instance_id': instance_id,
        'backup_type': db_info.backup_type,
        'checksum': db_info.checksum,
        'parent': parent,
        'datastore': ds.name,
        'datastore_version': ds_version.name,
    }
    # Hand the actual backup work to the task layer.
    api.API(context).create_backup(backup_info, instance_id)
    return db_info
def _create_resources():
    """Create the backup record and kick off the backup task.

    Validates the target instance, optionally resolves a parent backup
    for incremental backups, persists a DBBackup row and dispatches the
    work to the task API.  Returns the created record.
    """
    # parse the ID from the Ref
    instance_id = utils.get_id_from_href(instance)

    # verify that the instance exists and can perform actions
    from trove.instance.models import Instance
    instance_model = Instance.load(context, instance_id)
    instance_model.validate_can_perform_action()
    cls.validate_can_perform_action(instance_model, 'backup_create')
    cls.verify_swift_auth_token(context)
    # Backing up individual cluster members is not supported.
    if instance_model.cluster_id is not None:
        raise exception.ClusterInstanceOperationNotSupported()

    ds = instance_model.datastore
    ds_version = instance_model.datastore_version

    parent = None
    if parent_id:
        # Look up the parent info or fail early if not found or if
        # the user does not have access to the parent.
        _parent = cls.get_by_id(context, parent_id)
        parent = {'location': _parent.location,
                  'checksum': _parent.checksum}

    try:
        db_info = DBBackup.create(
            name=name,
            description=description,
            tenant_id=context.tenant,
            state=BackupState.NEW,
            instance_id=instance_id,
            parent_id=parent_id,
            datastore_version_id=ds_version.id,
            deleted=False)
    except exception.InvalidModelError as ex:
        LOG.exception(_("Unable to create backup record for "
                        "instance: %s"), instance_id)
        raise exception.BackupCreationError(str(ex))

    backup_info = {
        'id': db_info.id,
        'name': name,
        'description': description,
        'instance_id': instance_id,
        'backup_type': db_info.backup_type,
        'checksum': db_info.checksum,
        'parent': parent,
        'datastore': ds.name,
        'datastore_version': ds_version.name,
    }
    api.API(context).create_backup(backup_info, instance_id)
    return db_info
def create_cluster(self, context, cluster_id):
    """Create K2hdkcClusterTasks.

    This function is called in trove.taskmanager.Manager.create_cluster.
    Waits for every member to come up, then calls each guest's
    cluster_complete endpoint and clears the cluster task.
    """
    LOG.debug("Begins create_cluster for %s.", cluster_id)
    # 1. validates args
    if context is None:
        LOG.error("no context")
        return
    if cluster_id is None:
        LOG.error("no cluster_id")
        return
    timeout = Timeout(CONF.cluster_usage_timeout)
    LOG.debug("CONF.cluster_usage_timeout %s.", timeout)
    try:
        # 2. Retrieves db_instances from the database
        db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                           deleted=False).all()
        # 3. Retrieves instance ids from the db_instances
        instance_ids = [db_instance.id for db_instance in db_instances]
        # 4. Checks if instances are ready
        if not self._all_instances_running(instance_ids, cluster_id):
            LOG.error("instances are not ready yet")
            return
        # 5. Loads instances
        instances = [Instance.load(context, instance_id)
                     for instance_id in instance_ids]
        # 6. Instantiates GuestAgent for each guest instance
        # 7. Calls cluster_complete endpoint of K2hdkcGuestAgent
        for instance in instances:
            self.get_guest(instance).cluster_complete()
        # 8. reset the current cluster task status to None
        LOG.debug("reset cluster task to None")
        self.reset_task()
    except Timeout as t:
        # Bug fix: the previous check was 'if Timeout is not timeout',
        # which compares the Timeout *class* to the instance and is
        # always true, so our own timeout was unconditionally re-raised
        # and the failure status below was never recorded.  Compare the
        # caught instance instead.
        # Note adminstrators should reset task via CLI in this case.
        if t is not timeout:
            raise  # not my timeout
        LOG.exception("Timeout for building cluster.")
        self.update_statuses_on_failure(cluster_id)
    finally:
        timeout.cancel()
    LOG.debug("Completed create_cluster for %s.", cluster_id)
def _create_cluster():
    """Wire up a freshly provisioned MongoDB cluster."""
    # fetch instances by cluster_id against instances table
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    LOG.debug("instances in cluster %s: %s" % (cluster_id,
                                               instance_ids))

    if not self._all_instances_ready(instance_ids, cluster_id):
        return
    LOG.debug("all instances in cluster %s ready." % cluster_id)

    instances = [Instance.load(context, iid) for iid in instance_ids]

    def _of_type(kind):
        # Partition helper: all cluster members of the given type.
        return [inst for inst in instances if inst.type == kind]

    # filter query routers in instances into a new list: query_routers
    query_routers = _of_type('query_router')
    LOG.debug("query routers: %s" %
              [instance.id for instance in query_routers])
    # filter config servers in instances into new list: config_servers
    config_servers = _of_type('config_server')
    LOG.debug("config servers: %s" %
              [instance.id for instance in config_servers])
    # filter members (non router/configsvr) into a new list: members
    members = _of_type('member')
    LOG.debug("members: %s" % [instance.id for instance in members])

    # for config_server in config_servers, append ip/hostname to
    # "config_server_hosts", then
    # peel off the replica-set name and ip/hostname from 'x'
    config_server_ips = [self.get_ip(inst) for inst in config_servers]
    LOG.debug("config server ips: %s" % config_server_ips)

    if not self._add_query_routers(query_routers, config_server_ips):
        return
    if not self._create_shard(query_routers[0], members):
        return
    # call to start checking status
    for instance in instances:
        self.get_guest(instance).cluster_complete()
def shrink(self, instances):
    """Removes instances from a cluster."""
    LOG.debug("Shrinking cluster %s." % self.id)
    self.validate_cluster_available()

    # Load the instances slated for removal.
    removal_instances = [Instance.load(self.context, inst_id)
                         for inst_id in instances]

    # The cluster must retain at least one member after the shrink.
    db_instances = DBInstance.find_all(cluster_id=self.db_info.id).all()
    if len(db_instances) - len(removal_instances) < 1:
        raise exception.ClusterShrinkMustNotLeaveClusterEmpty()

    self.db_info.update(task_status=ClusterTasks.SHRINKING_CLUSTER)
    removal_ids = [instance.id for instance in removal_instances]
    task_api.load(self.context, self.ds_version.manager).shrink_cluster(
        self.db_info.id, removal_ids)
    return PXCCluster(self.context, self.db_info, self.ds,
                      self.ds_version)
def _create_cluster():
    """Form a Redis cluster from the members of ``cluster_id``.

    Meets every node with the first node, spreads the 16384 hash
    slots evenly across the nodes, then signals completion to each
    guest.
    """
    # Fetch instances by cluster_id against instances table.
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(instance_ids, cluster_id):
        return
    LOG.debug("All members ready, proceeding for cluster setup.")
    instances = [Instance.load(context, instance_id)
                 for instance_id in instance_ids]
    # Connect nodes to the first node
    guests = [self.get_guest(instance) for instance in instances]
    try:
        cluster_head = instances[0]
        cluster_head_port = '6379'
        cluster_head_ip = self.get_ip(cluster_head)
        for guest in guests[1:]:
            guest.cluster_meet(cluster_head_ip, cluster_head_port)
        num_nodes = len(instances)
        total_slots = 16384
        # Bug fix: use floor division.  Under Python 3 the original
        # '/' yields a float, so fractional slot numbers would be
        # passed to cluster_addslots.
        slots_per_node = total_slots // num_nodes
        leftover_slots = total_slots % num_nodes
        first_slot = 0
        for guest in guests:
            # Each node owns slots_per_node slots; the first
            # 'leftover_slots' nodes absorb one extra slot each.
            last_slot = first_slot + slots_per_node
            if leftover_slots > 0:
                leftover_slots -= 1
            else:
                last_slot -= 1
            guest.cluster_addslots(first_slot, last_slot)
            first_slot = last_slot + 1
        for guest in guests:
            guest.cluster_complete()
    except Exception:
        LOG.exception(_("Error creating cluster."))
        self.update_statuses_on_failure(cluster_id)
def _create_resources():
    """Record a new backup and dispatch it to the task manager."""
    from trove.instance.models import Instance

    # parse the ID from the Ref
    backup_target = utils.get_id_from_href(instance)
    # verify that the instance exists and can perform actions
    model = Instance.load(context, backup_target)
    model.validate_can_perform_action()
    cls.verify_swift_auth_token(context)

    if parent_id:
        # Look up the parent info or fail early if not found or if
        # the user does not have access to the parent.
        _parent = cls.get_by_id(context, parent_id)
        parent = {"location": _parent.location,
                  "checksum": _parent.checksum}
    else:
        parent = None

    try:
        db_info = DBBackup.create(name=name,
                                  description=description,
                                  tenant_id=context.tenant,
                                  state=BackupState.NEW,
                                  instance_id=backup_target,
                                  parent_id=parent_id,
                                  deleted=False)
    except exception.InvalidModelError as ex:
        LOG.exception("Unable to create Backup record:")
        raise exception.BackupCreationError(str(ex))

    backup_info = {
        "id": db_info.id,
        "name": name,
        "description": description,
        "instance_id": backup_target,
        "backup_type": db_info.backup_type,
        "checksum": db_info.checksum,
        "parent": parent,
    }
    api.API(context).create_backup(backup_info, backup_target)
    return db_info
def _create_cluster():
    """Assemble a Redis cluster: meet all nodes, assign hash slots,
    and signal each guest that setup is complete.
    """
    # Fetch instances by cluster_id against instances table.
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(instance_ids, cluster_id):
        return
    LOG.debug("All members ready, proceeding for cluster setup.")
    instances = [Instance.load(context, instance_id)
                 for instance_id in instance_ids]
    # Connect nodes to the first node
    guests = [self.get_guest(instance) for instance in instances]
    try:
        cluster_head = instances[0]
        cluster_head_port = '6379'
        cluster_head_ip = self.get_ip(cluster_head)
        for guest in guests[1:]:
            guest.cluster_meet(cluster_head_ip, cluster_head_port)
        num_nodes = len(instances)
        total_slots = 16384
        # Bug fix: '//' instead of '/'.  Python 3 true division
        # produced a float here, breaking the slot arithmetic below.
        slots_per_node = total_slots // num_nodes
        leftover_slots = total_slots % num_nodes
        first_slot = 0
        for guest in guests:
            # Distribute the remainder one slot at a time so every
            # node ends up with slots_per_node or slots_per_node + 1.
            last_slot = first_slot + slots_per_node
            if leftover_slots > 0:
                leftover_slots -= 1
            else:
                last_slot -= 1
            guest.cluster_addslots(first_slot, last_slot)
            first_slot = last_slot + 1
        for guest in guests:
            guest.cluster_complete()
    except Exception:
        LOG.exception(_("Error creating cluster."))
        self.update_statuses_on_failure(cluster_id)
def load_cluster_nodes(cls, context, node_ids):
    """Build node-info dicts for each of the given node IDs."""
    nodes = []
    for node_id in node_ids:
        instance = Instance.load(context, node_id)
        nodes.append(cls.build_node_info(instance))
    return nodes
def get_cluster_admin_password(self, context):
    """The cluster admin's user credentials are stored on all query
    routers. Find one and get the guest to return the password.
    """
    query_router_id = self._get_running_query_router_id()
    instance = Instance.load(context, query_router_id)
    guest = self.get_guest(instance)
    return guest.get_admin_password()
def create(cls, context, instance_id, name, description=None,
           group_id=None, backup_type=None, expire_at=None, init=False,
           service_image_id=None, parent_id=None):
    """Create a snapshot or autobackup for an instance.

    Chooses the actual backup source (a master's standby when one
    exists), decides between a full and an incremental backup, then
    creates the backup record under quota and enqueues the task.

    :raises exception.UnprocessableEntity: instance not ACTIVE (init).
    :raises exception.TroveError: missing group_id or bad backup_type.
    :raises exception.NotFound: the requested parent backup is absent.
    :raises exception.BackupCreationError: DB record creation failed.
    """
    # Normalize parent_id: treat blank/whitespace strings as "no
    # parent requested".
    if parent_id is not None:
        parent_id = str(parent_id)
        LOG.info("parent_id:%s, parent_id.len:%s", parent_id,
                 len(parent_id.strip()))
        if len(parent_id.strip()) == 0:
            parent_id = None
    _parent_id = parent_id

    from trove.instance.models import Instance
    instance_id = utils.get_id_from_href(instance_id)
    instance_model = Instance.load(context, instance_id)
    if init:
        if instance_model.db_info.server_status != 'ACTIVE':
            # Bug fix: the original built a (template, arg) tuple here
            # instead of formatting the message string.
            msg = ("Instance is not currently available for an action "
                   "to be performed (server_status was %s)."
                   % instance_model.db_info.server_status)
            LOG.error(msg)
            raise exception.UnprocessableEntity(msg)
    else:
        instance_model.validate_can_perform_action()

    # Back up the standby instead of the master whenever one exists.
    if instance_model.type == DBInstanceType.MASTER:
        try:
            standby_id = InstanceGroupItem.get_by_gid_type(
                context, instance_model.group_id,
                DBInstanceType.STANDBY).instance_id
            instance_model = Instance.load(context, standby_id)
            instance_model.validate_can_perform_action()
            instance_id = standby_id
        except Exception as e:
            LOG.error(e)
            raise  # bare raise keeps the original traceback

    if group_id is None:
        raise exception.TroveError("group_id can't None")
    if backup_type is None or backup_type not in [Type.SNAPSHOT,
                                                  Type.AUTOBACKUP]:
        raise exception.TroveError("instType can't None, only accept "
                                   "value: snapshot or autobackup ")

    if backup_type == Type.SNAPSHOT:
        # Snapshots never expire and are always full backups.
        expire_time = 0
        _parent_id = None  # force full
    elif backup_type == Type.AUTOBACKUP:
        expire_time = int(expire_at)
        if parent_id and parent_id == '0':
            _parent_id = None  # force full
        elif parent_id and parent_id != '0':
            # An explicit parent was requested; it must exist.
            try:
                backup_parent = cls.get_by_id(context, parent_id)
                LOG.debug("backup_parent:%s", backup_parent)
            except Exception:  # was a bare 'except:' — too broad
                raise exception.NotFound(
                    "not found backup with parent_id: %s" % parent_id)
            if not backup_parent:
                raise exception.NotFound(
                    "not found backup with parent_id: %s" % parent_id)
        elif parent_id is None:
            LOG.debug("parent_id is None:%s", parent_id)
            # Chain a new incremental onto the last backup chain
            # unless the chain is empty, already at its configured
            # maximum length, or the backup source changed (failover)
            # since the chain started — those cases force a full.
            last_backup_chain = cls.get_last_backup_chain(group_id)
            backup_incremental_chain_size = (
                CONF.backup_incremental_chain_size)
            LOG.info("last_backup_chain: %s, "
                     "backup_incremental_chain_size: %s",
                     last_backup_chain, backup_incremental_chain_size)
            if (len(last_backup_chain) == 0 or
                    len(last_backup_chain) >=
                    int(backup_incremental_chain_size)):
                _parent_id = None  # create full
            else:
                compare_instance = None
                try:
                    compare_instance = InstanceGroupItem.get_by_gid_type(
                        context, group_id, DBInstanceType.STANDBY)
                except exception.NotFound:
                    # not has standby
                    try:
                        compare_instance = (
                            InstanceGroupItem.get_by_gid_type(
                                context, group_id,
                                DBInstanceType.SINGLE))
                    except exception.NotFound:
                        # not has single
                        pass
                if compare_instance:
                    compare_id = compare_instance.instance_id
                    switched = False
                    for b in last_backup_chain:
                        # has standby
                        if b["instance_id"] != compare_id:
                            switched = True  # create full
                            LOG.debug("last_backup_chain: %s, "
                                      "switched: %s, "
                                      "backup_instance_id: %s, "
                                      "b.instance_id: %s",
                                      last_backup_chain, switched,
                                      compare_id, b["instance_id"])
                            break
                    if not switched:
                        parent = last_backup_chain.pop()
                        # create incremental
                        _parent_id = parent["id"]
                else:
                    # not found standby and single
                    _parent_id = None  # create full

    LOG.debug("create backup use parent_id: %s", _parent_id)

    def _create_resources():
        # Runs under quota: create the record, then enqueue the task.
        try:
            db_info = models.DBBackup.create(
                name=name, description=description,
                tenant_id=context.tenant,
                state=models.BackupState.NEW,
                instance_id=instance_id, deleted=False,
                group_id=group_id, type=backup_type,
                expire_at=expire_time,
                service_image_id=service_image_id,
                parent_id=_parent_id)
        except exception.InvalidModelError as ex:
            LOG.exception("Unable to create Backup record:")
            # Bug fix: the original referenced the undefined name
            # '_instance_id' here, raising NameError and masking the
            # intended alarm + BackupCreationError.
            msg = ("Unable to create Backup record, group_id %s, "
                   "instance_id %s, parent_id %s "
                   % (group_id, instance_id, _parent_id))
            AlarmRpc(context).alarm(
                context.tenant, level=AlarmRpc.LEVEL_ERROR,
                _type=AlarmRpc.TYPE_TASKMANAGER,
                message=msg + str(ex))
            raise exception.BackupCreationError(str(ex))
        api.API(context).create_backup(db_info.id, instance_id)
        return db_info

    return run_with_quotas(context.tenant, {'backups': 1},
                           _create_resources)
def _create_cluster():
    """Configure a newly provisioned MongoDB cluster.

    Partitions the cluster's instances into query routers, config
    servers and members, points the routers at the config servers,
    creates the shared admin user, builds the replica set and shard,
    and finally signals each guest that cluster setup is complete.
    """
    # fetch instances by cluster_id against instances table
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    LOG.debug("instances in cluster %s: %s" % (cluster_id,
                                               instance_ids))
    if not self._all_instances_ready(instance_ids, cluster_id):
        return
    LOG.debug("all instances in cluster %s ready." % cluster_id)
    instances = [Instance.load(context, instance_id)
                 for instance_id in instance_ids]
    # filter query routers in instances into a new list: query_routers
    query_routers = [instance for instance in instances
                     if instance.type == 'query_router']
    LOG.debug("query routers: %s" %
              [instance.id for instance in query_routers])
    # filter config servers in instances into new list: config_servers
    config_servers = [instance for instance in instances
                      if instance.type == 'config_server']
    LOG.debug("config servers: %s" %
              [instance.id for instance in config_servers])
    # filter members (non router/configsvr) into a new list: members
    members = [instance for instance in instances
               if instance.type == 'member']
    LOG.debug("members: %s" % [instance.id for instance in members])
    # for config_server in config_servers, append ip/hostname to
    # "config_server_hosts", then
    # peel off the replica-set name and ip/hostname from 'x'
    config_server_ips = [self.get_ip(instance)
                         for instance in config_servers]
    LOG.debug("config server ips: %s" % config_server_ips)
    # Give the query routers the configsvr ips to connect to.
    # Create the admin user on the query routers.
    # The first will create the user, and the others will just reset
    # the password to the same value.
    LOG.debug("calling add_config_servers on, and sending admin user "
              "password to, query_routers")
    try:
        admin_created = False
        # One shared admin password: the first router creates the
        # user, every subsequent router just stores the password.
        admin_password = utils.generate_random_password()
        for query_router in query_routers:
            guest = self.get_guest(query_router)
            guest.add_config_servers(config_server_ips)
            if admin_created:
                guest.store_admin_password(admin_password)
            else:
                guest.create_admin_user(admin_password)
                admin_created = True
    except Exception:
        # Any guest failure marks the whole cluster as failed.
        LOG.exception(_("error adding config servers"))
        self.update_statuses_on_failure(cluster_id)
        return
    if not self._create_replica_set(members, cluster_id):
        return
    replica_set_name = "rs1"
    if not self._create_shard(query_routers, replica_set_name,
                              members, cluster_id):
        return
    # call to start checking status
    for instance in instances:
        self.get_guest(instance).cluster_complete()
def _grow_cluster():
    """Join new nodes into an existing Cassandra cluster.

    Recomputes the seed list, starts the added nodes (seeds first),
    and runs nodetool cleanup on the pre-existing nodes one at a time.
    """
    # Wait for new nodes to get to cluster-ready status.
    LOG.debug("Waiting for new nodes to become ready.")
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return
    new_instances = [Instance.load(context, instance_id)
                     for instance_id in new_instance_ids]
    added_nodes = [self.build_node_info(instance)
                   for instance in new_instances]
    LOG.debug("All nodes ready, proceeding with cluster setup.")
    cluster_node_ids = self.find_cluster_node_ids(cluster_id)
    cluster_nodes = self.load_cluster_nodes(context, cluster_node_ids)
    # Recompute the seed nodes based on the updated cluster geometry.
    seeds = self.choose_seed_nodes(cluster_nodes)
    # Configure each cluster node with the updated list of seeds.
    # Since we are adding to an existing cluster, ensure that the
    # new nodes have auto-bootstrapping enabled.
    # Start the added nodes.
    try:
        LOG.debug("Selected seed nodes: %s" % seeds)
        # Update the seeds on all nodes.
        # Also retrieve the superuser password from one previously
        # existing node.
        admin_creds = None
        for node in cluster_nodes:
            LOG.debug("Configuring node: %s." % node["id"])
            node["guest"].set_seeds(seeds)
            # Only pre-existing nodes hold valid credentials.
            if (admin_creds is None) and (node not in added_nodes):
                admin_creds = node["guest"].get_admin_credentials()
        # Start any seeds from the added nodes first.
        LOG.debug("Starting new seed nodes.")
        for node in added_nodes:
            if node["ip"] in seeds:
                node["guest"].set_auto_bootstrap(True)
                node["guest"].store_admin_credentials(admin_creds)
                node["guest"].restart()
                node["guest"].cluster_complete()
        LOG.debug("All new seeds running, starting the remaining of "
                  "added nodes.")
        for node in added_nodes:
            if node["ip"] not in seeds:
                node["guest"].set_auto_bootstrap(True)
                node["guest"].store_admin_credentials(admin_creds)
                node["guest"].restart()
                node["guest"].cluster_complete()
        # Run nodetool cleanup on each of the previously existing nodes
        # to remove the keys that no longer belong to those nodes.
        # Wait for cleanup to complete on one node before running
        # it on the next node.
        LOG.debug("Cleaning up orphan data on old cluster nodes.")
        for node in cluster_nodes:
            if node not in added_nodes:
                nid = node["id"]
                node["guest"].node_cleanup_begin()
                node["guest"].node_cleanup()
                LOG.debug("Waiting for node to finish its "
                          "cleanup: %s" % nid)
                if not self._all_instances_running([nid], cluster_id):
                    LOG.warning(_("Node did not complete cleanup "
                                  "successfully: %s") % nid)
        LOG.debug("Cluster configuration finished successfully.")
    except Exception:
        LOG.exception(_("Error growing cluster."))
        self.update_statuses_on_failure(cluster_id)
def _create_resources():
    """Validate the instance, resolve the (incremental) parent
    backup, create the backup DB record, and enqueue the backup task.
    """
    # parse the ID from the Ref
    instance_id = utils.get_id_from_href(instance)
    # verify that the instance exists and can perform actions
    from trove.instance.models import Instance
    instance_model = Instance.load(context, instance_id)
    instance_model.validate_can_perform_action()
    cls.validate_can_perform_action(
        instance_model, 'backup_create')
    cls.verify_swift_auth_token(context)
    ds = instance_model.datastore
    ds_version = instance_model.datastore_version
    # Per-datastore-manager config decides whether a cluster member
    # may be backed up individually.
    manager = (datastore_models.DatastoreVersion.load_by_uuid(
        ds_version.id).manager)
    if (not CONF.get(manager).enable_cluster_instance_backup
            and instance_model.cluster_id is not None):
        raise exception.ClusterInstanceOperationNotSupported()
    parent = None
    parent_backup = None
    parent_backup_id = None
    last_backup_id = None
    if parent_id:
        # Look up the parent info or fail early if not found or if
        # the user does not have access to the parent.
        if parent_id == 'last':
            # Need to assign parent_id to a new variable
            # parent_backup_id, otherwise an unbound variable error
            # will be thrown
            parent_backup = Backup.get_last_completed(
                context, instance_id, True)
            if parent_backup is None:
                raise exception.NotFound()
            parent_backup_id = parent_backup.id
        else:
            parent_backup_id = parent_id
        _parent = cls.get_by_id(context, parent_backup_id)
        parent = {
            'id': parent_backup_id,
            'location': _parent.location,
            'checksum': _parent.checksum,
        }
    elif incremental:
        # No explicit parent: chain off the most recent completed
        # backup; fall back to a full backup when none exists.
        _parent = Backup.get_last_completed(context, instance_id)
        if _parent:
            parent = {
                'location': _parent.location,
                'checksum': _parent.checksum
            }
            last_backup_id = _parent.id
    try:
        db_info = DBBackup.create(name=name,
                                  description=description,
                                  tenant_id=context.tenant,
                                  state=BackupState.NEW,
                                  instance_id=instance_id,
                                  parent_id=parent_backup_id or
                                  last_backup_id,
                                  datastore_version_id=ds_version.id,
                                  deleted=False)
    except exception.InvalidModelError as ex:
        LOG.exception(_("Unable to create backup record for "
                        "instance: %s"), instance_id)
        raise exception.BackupCreationError(str(ex))
    backup_info = {'id': db_info.id,
                   'name': name,
                   'description': description,
                   'instance_id': instance_id,
                   'backup_type': db_info.backup_type,
                   'checksum': db_info.checksum,
                   'parent': parent,
                   'datastore': ds.name,
                   'datastore_version': ds_version.name,
                   }
    api.API(context).create_backup(backup_info, instance_id)
    return db_info
def _grow_cluster():
    """Join new instances into a running Galera (PXC) cluster.

    Installs the cluster config on the new members (non-bootstrap),
    then rewrites the cluster.cnf overrides on every member with the
    enlarged IP list.
    """
    db_instances = DBInstance.find_all(
        cluster_id=cluster_id, deleted=False).all()
    existing_instances = [Instance.load(context, db_inst.id)
                          for db_inst in db_instances
                          if db_inst.id not in new_instance_ids]
    if not existing_instances:
        raise TroveError(_("Unable to determine existing cluster "
                           "member(s)"))
    # get list of ips of existing cluster members
    existing_cluster_ips = [self.get_ip(instance)
                            for instance in existing_instances]
    existing_instance_guests = [self.get_guest(instance)
                                for instance in existing_instances]
    # get the cluster context to setup new members
    cluster_context = existing_instance_guests[0].get_cluster_context()
    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        raise TroveError(_("Instances in cluster did not report "
                           "ACTIVE"))
    LOG.debug("All members ready, proceeding for cluster setup.")
    # Get the new instances to join the cluster
    new_instances = [Instance.load(context, instance_id)
                     for instance_id in new_instance_ids]
    new_cluster_ips = [self.get_ip(instance)
                       for instance in new_instances]
    for instance in new_instances:
        guest = self.get_guest(instance)
        # New members take the cluster's existing admin password so
        # they can authenticate after syncing with a donor.
        guest.reset_admin_password(cluster_context['admin_password'])
        # render the conf.d/cluster.cnf configuration
        cluster_configuration = self._render_cluster_config(
            context, instance, ",".join(existing_cluster_ips),
            cluster_context['cluster_name'],
            cluster_context['replication_user'])
        # push the cluster config and bootstrap the first instance
        bootstrap = False
        guest.install_cluster(cluster_context['replication_user'],
                              cluster_configuration, bootstrap)
    self._check_cluster_for_root(context, existing_instances,
                                 new_instances)
    # apply the new config to all instances
    for instance in existing_instances + new_instances:
        guest = self.get_guest(instance)
        # render the conf.d/cluster.cnf configuration
        cluster_configuration = self._render_cluster_config(
            context, instance,
            ",".join(existing_cluster_ips + new_cluster_ips),
            cluster_context['cluster_name'],
            cluster_context['replication_user'])
        guest.write_cluster_configuration_overrides(
            cluster_configuration)
    for instance in new_instances:
        guest = self.get_guest(instance)
        guest.cluster_complete()
def _create_cluster():
    """Bootstrap a new Galera (PXC) cluster.

    Generates shared credentials and a cluster name, bootstraps the
    first instance and joins the rest, then finalizes every guest.
    """
    # Fetch instances by cluster_id against instances table.
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    instance_ids = [db_instance.id for db_instance in db_instances]
    LOG.debug("Waiting for instances to get to cluster-ready status.")
    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(instance_ids, cluster_id):
        raise TroveError(_("Instances in cluster did not report "
                           "ACTIVE"))
    LOG.debug("All members ready, proceeding for cluster setup.")
    instances = [Instance.load(context, instance_id)
                 for instance_id in instance_ids]
    cluster_ips = [self.get_ip(instance) for instance in instances]
    instance_guests = [self.get_guest(instance)
                       for instance in instances]
    # Create replication user and password for synchronizing the
    # galera cluster
    replication_user = {
        "name": self.CLUSTER_REPLICATION_USER,
        "password": utils.generate_random_password(),
    }
    # Galera cluster name must be unique and be shorter than a full
    # uuid string so we remove the hyphens and chop it off. It was
    # recommended to be 16 chars or less.
    # (this is not currently documented on Galera docs)
    cluster_name = utils.generate_uuid().replace("-", "")[:16]
    LOG.debug("Configuring cluster configuration.")
    try:
        # Set the admin password for all the instances because the
        # password in the my.cnf will be wrong after the joiner
        # instances syncs with the donor instance.
        admin_password = str(utils.generate_random_password())
        for guest in instance_guests:
            guest.reset_admin_password(admin_password)
        # Only the very first node bootstraps the cluster; the rest
        # join it.
        bootstrap = True
        for instance in instances:
            guest = self.get_guest(instance)
            # render the conf.d/cluster.cnf configuration
            cluster_configuration = self._render_cluster_config(
                context, instance, ",".join(cluster_ips),
                cluster_name, replication_user)
            # push the cluster config and bootstrap the first instance
            guest.install_cluster(replication_user,
                                  cluster_configuration, bootstrap)
            bootstrap = False
        LOG.debug("Finalizing cluster configuration.")
        for guest in instance_guests:
            guest.cluster_complete()
    except Exception:
        LOG.exception(_("Error creating cluster."))
        self.update_statuses_on_failure(cluster_id)
def _grow_cluster():
    """Add new members to an existing Galera (PXC) cluster.

    Configures the new members against the current member IPs, then
    pushes the enlarged IP list to every member's cluster.cnf.
    """
    db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                       deleted=False).all()
    existing_instances = [Instance.load(context, db_inst.id)
                          for db_inst in db_instances
                          if db_inst.id not in new_instance_ids]
    if not existing_instances:
        raise TroveError(_("Unable to determine existing cluster "
                           "member(s)"))
    # get list of ips of existing cluster members
    existing_cluster_ips = [self.get_ip(instance)
                            for instance in existing_instances]
    existing_instance_guests = [self.get_guest(instance)
                                for instance in existing_instances]
    # get the cluster context to setup new members
    cluster_context = existing_instance_guests[0].get_cluster_context()
    # Wait for cluster members to get to cluster-ready status.
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        raise TroveError(_("Instances in cluster did not report "
                           "ACTIVE"))
    LOG.debug("All members ready, proceeding for cluster setup.")
    # Get the new instances to join the cluster
    new_instances = [Instance.load(context, instance_id)
                     for instance_id in new_instance_ids]
    new_cluster_ips = [self.get_ip(instance)
                       for instance in new_instances]
    for instance in new_instances:
        guest = self.get_guest(instance)
        # Joiners need the cluster's admin password to authenticate
        # after state transfer from a donor.
        guest.reset_admin_password(cluster_context['admin_password'])
        # render the conf.d/cluster.cnf configuration
        cluster_configuration = self._render_cluster_config(
            context, instance, ",".join(existing_cluster_ips),
            cluster_context['cluster_name'],
            cluster_context['replication_user'])
        # push the cluster config and bootstrap the first instance
        bootstrap = False
        guest.install_cluster(cluster_context['replication_user'],
                              cluster_configuration, bootstrap)
    self._check_cluster_for_root(context, existing_instances,
                                 new_instances)
    # apply the new config to all instances
    for instance in existing_instances + new_instances:
        guest = self.get_guest(instance)
        # render the conf.d/cluster.cnf configuration
        cluster_configuration = self._render_cluster_config(
            context, instance,
            ",".join(existing_cluster_ips + new_cluster_ips),
            cluster_context['cluster_name'],
            cluster_context['replication_user'])
        guest.write_cluster_configuration_overrides(
            cluster_configuration)
    for instance in new_instances:
        guest = self.get_guest(instance)
        guest.cluster_complete()
def load_cluster_nodes(cls, context, node_ids):
    """Load each node ID and wrap it in the standard node-info dict."""
    loaded = (Instance.load(context, node_id) for node_id in node_ids)
    return [cls.build_node_info(instance) for instance in loaded]
def _grow_cluster():
    """Grow a Cassandra cluster with the given new instances.

    Reseeds all nodes, starts the added nodes (new seeds first), and
    serially cleans up orphaned keys on the pre-existing nodes.
    """
    # Wait for new nodes to get to cluster-ready status.
    LOG.debug("Waiting for new nodes to become ready.")
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return
    new_instances = [Instance.load(context, instance_id)
                     for instance_id in new_instance_ids]
    added_nodes = [self.build_node_info(instance)
                   for instance in new_instances]
    LOG.debug("All nodes ready, proceeding with cluster setup.")
    cluster_node_ids = self.find_cluster_node_ids(cluster_id)
    cluster_nodes = self.load_cluster_nodes(context, cluster_node_ids)
    # Recompute the seed nodes based on the updated cluster geometry.
    seeds = self.choose_seed_nodes(cluster_nodes)
    # Configure each cluster node with the updated list of seeds.
    # Since we are adding to an existing cluster, ensure that the
    # new nodes have auto-bootstrapping enabled.
    # Start the added nodes.
    try:
        LOG.debug("Selected seed nodes: %s" % seeds)
        # Update the seeds on all nodes.
        # Also retrieve the superuser password from one previously
        # existing node.
        admin_creds = None
        for node in cluster_nodes:
            LOG.debug("Configuring node: %s." % node['id'])
            node['guest'].set_seeds(seeds)
            # Credentials can only come from a pre-existing node.
            if (admin_creds is None) and (node not in added_nodes):
                admin_creds = node['guest'].get_admin_credentials()
        # Start any seeds from the added nodes first.
        LOG.debug("Starting new seed nodes.")
        for node in added_nodes:
            if node['ip'] in seeds:
                node['guest'].set_auto_bootstrap(True)
                node['guest'].store_admin_credentials(admin_creds)
                node['guest'].restart()
                node['guest'].cluster_complete()
        LOG.debug("All new seeds running, starting the remaining of "
                  "added nodes.")
        for node in added_nodes:
            if node['ip'] not in seeds:
                node['guest'].set_auto_bootstrap(True)
                node['guest'].store_admin_credentials(admin_creds)
                node['guest'].restart()
                node['guest'].cluster_complete()
        # Run nodetool cleanup on each of the previously existing nodes
        # to remove the keys that no longer belong to those nodes.
        # Wait for cleanup to complete on one node before running
        # it on the next node.
        LOG.debug("Cleaning up orphan data on old cluster nodes.")
        for node in cluster_nodes:
            if node not in added_nodes:
                nid = node['id']
                node['guest'].node_cleanup_begin()
                node['guest'].node_cleanup()
                LOG.debug("Waiting for node to finish its "
                          "cleanup: %s" % nid)
                if not self._all_instances_running([nid], cluster_id):
                    LOG.warning(_("Node did not complete cleanup "
                                  "successfully: %s") % nid)
        LOG.debug("Cluster configuration finished successfully.")
    except Exception:
        LOG.exception(_("Error growing cluster."))
        self.update_statuses_on_failure(cluster_id)
def shrink_cluster(self, context, cluster_id, removal_ids):
    """Shrink a K2hdkc Cluster.

    Finalizes and deletes the instances in ``removal_ids``, then
    clears the cluster task.
    """
    # Bug fix: the original mixed eager str.format with lazy %s
    # interpolation in one call; use lazy logging args throughout.
    LOG.debug("Begins shrink_cluster for %s. removal_ids: %s",
              cluster_id, removal_ids)
    # 1. validates args
    if context is None:
        LOG.error("no context")
        return
    if cluster_id is None:
        LOG.error("no cluster_id")
        return
    if removal_ids is None:
        LOG.error("no removal_ids")
        return
    timeout = Timeout(CONF.cluster_usage_timeout)
    try:
        # 2. Retrieves db_instances from the database
        db_instances = DBInstance.find_all(cluster_id=cluster_id,
                                           deleted=False).all()
        # 3. Retrieves instance ids from the db_instances
        instance_ids = [db_instance.id for db_instance in db_instances]
        # 4. Checks if instances are running
        if not self._all_instances_running(instance_ids, cluster_id):
            LOG.error("instances are not ready yet")
            return
        # 5. Loads only the instances slated for removal
        instances = [Instance.load(context, instance_id)
                     for instance_id in removal_ids]
        LOG.debug("len(instances) %s", len(instances))
        # 6. Instances GuestAgent class
        # 6.2. Checks if removing instances are
        # if not self._all_instances_shutdown(removal_ids, cluster_id):
        #     LOG.error("removing instances are not shutdown yet")
        #     return
        # 7. Calls cluster_complete endpoint of K2hdkcGuestAgent
        LOG.debug("Calling cluster_complete as a final hook to each "
                  "node in the cluster")
        for instance in instances:
            self.get_guest(instance).cluster_complete()
        # 8. delete node from OpenStack
        LOG.debug("delete node from OpenStack")
        for instance in instances:
            Instance.delete(instance)
        # 9. reset the current cluster task status to None
        LOG.debug("reset cluster task to None")
        self.reset_task()
    except Timeout as t:
        # Bug fix: 'Timeout is not timeout' compared the class to the
        # instance and was always true, so the SHRINKING_ERROR status
        # below could never be recorded.  Compare the caught instance.
        # Note adminstrators should reset task via CLI in this case.
        if t is not timeout:
            raise  # not my timeout
        LOG.exception("Timeout for shrink cluster.")
        self.update_statuses_on_failure(
            cluster_id, status=inst_tasks.InstanceTasks.SHRINKING_ERROR)
    finally:
        timeout.cancel()
    LOG.debug("Completed shrink_cluster for %s.", cluster_id)
def _grow_cluster():
    """Join newly provisioned nodes to an existing cluster.

    Waits for the new instances to report cluster-ready, bootstraps
    them using the current members' seed list and admin credentials,
    recomputes and distributes the seed list for the enlarged
    cluster, then runs a sequential data cleanup on each
    pre-existing node.  On any failure the cluster is flagged as
    failed via ``update_statuses_on_failure``.
    """
    # Block until every newly added instance is cluster-ready.
    LOG.debug("Waiting for new nodes to become ready.")
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return

    fresh_instances = [Instance.load(context, iid)
                       for iid in new_instance_ids]
    added_nodes = [self.build_node_info(inst)
                   for inst in fresh_instances]

    LOG.debug("All nodes ready, proceeding with cluster setup.")

    node_ids = self.find_cluster_node_ids(cluster_id)
    cluster_nodes = self.load_cluster_nodes(context, node_ids)
    old_nodes = [member for member in cluster_nodes
                 if member['id'] not in new_instance_ids]

    try:
        # All nodes should have the same seeds and credentials;
        # read them off the first pre-existing node.
        reference = old_nodes[0]
        current_seeds = reference['guest'].get_seeds()
        admin_creds = reference['guest'].get_admin_credentials()

        # Bootstrap the new nodes.  Seed nodes do not bootstrap, and
        # the currently running members must serve as seeds during
        # the process, so auto-bootstrapping is explicitly enabled
        # on each node being added before it is (re)started.
        LOG.debug("Starting new nodes.")
        for member in added_nodes:
            guest = member['guest']
            guest.set_auto_bootstrap(True)
            guest.set_seeds(current_seeds)
            guest.store_admin_credentials(admin_creds)
            guest.restart()
            guest.cluster_complete()

        # Recompute the seed nodes for the updated cluster geometry
        # and push the new list to every member.
        seeds = self.choose_seed_nodes(cluster_nodes)
        LOG.debug("Updating all nodes with new seeds: %s", seeds)
        for member in cluster_nodes:
            member['guest'].set_seeds(seeds)

        # Run nodetool cleanup on each previously existing node to
        # drop keys that no longer belong there.  Cleanup is done
        # one node at a time: wait for each node to finish before
        # starting the next.
        LOG.debug("Cleaning up orphan data on old cluster nodes.")
        for member in old_nodes:
            nid = member['id']
            member['guest'].node_cleanup_begin()
            member['guest'].node_cleanup()
            LOG.debug("Waiting for node to finish its "
                      "cleanup: %s", nid)
            if not self._all_instances_running([nid], cluster_id):
                LOG.warning(_("Node did not complete cleanup "
                              "successfully: %s"), nid)

        LOG.debug("Cluster configuration finished successfully.")
    except Exception:
        LOG.exception(_("Error growing cluster."))
        self.update_statuses_on_failure(cluster_id)
def _grow_cluster():
    """Join newly provisioned nodes to an existing cluster.

    Waits for the new instances to report cluster-ready, bootstraps
    them using the current members' seed list and admin credentials,
    recomputes and distributes the seed list for the enlarged
    cluster, then runs a sequential data cleanup on each
    pre-existing node.  On any failure the cluster is flagged as
    failed via ``update_statuses_on_failure``.
    """
    # Wait for new nodes to get to cluster-ready status.
    LOG.debug("Waiting for new nodes to become ready.")
    if not self._all_instances_ready(new_instance_ids, cluster_id):
        return

    new_instances = [Instance.load(context, instance_id)
                     for instance_id in new_instance_ids]
    added_nodes = [self.build_node_info(instance)
                   for instance in new_instances]

    LOG.debug("All nodes ready, proceeding with cluster setup.")

    cluster_node_ids = self.find_cluster_node_ids(cluster_id)
    cluster_nodes = self.load_cluster_nodes(context, cluster_node_ids)
    old_nodes = [node for node in cluster_nodes
                 if node['id'] not in new_instance_ids]

    try:
        # All nodes should have the same seeds and credentials.
        # Retrieve the information from the first node.
        test_node = old_nodes[0]
        current_seeds = test_node['guest'].get_seeds()
        admin_creds = test_node['guest'].get_admin_credentials()

        # Bootstrap new nodes.
        # Seed nodes do not bootstrap. Current running nodes
        # must be used as seeds during the process.
        # Since we are adding to an existing cluster, ensure that the
        # new nodes have auto-bootstrapping enabled.
        # Start the added nodes.
        LOG.debug("Starting new nodes.")
        for node in added_nodes:
            node['guest'].set_auto_bootstrap(True)
            node['guest'].set_seeds(current_seeds)
            node['guest'].store_admin_credentials(admin_creds)
            node['guest'].restart()
            node['guest'].cluster_complete()

        # Recompute the seed nodes based on the updated cluster
        # geometry.
        seeds = self.choose_seed_nodes(cluster_nodes)

        # Configure each cluster node with the updated list of seeds.
        # NOTE: use lazy logging args instead of eager '%' formatting
        # so the message is only rendered when DEBUG is enabled.
        LOG.debug("Updating all nodes with new seeds: %s", seeds)
        for node in cluster_nodes:
            node['guest'].set_seeds(seeds)

        # Run nodetool cleanup on each of the previously existing nodes
        # to remove the keys that no longer belong to those nodes.
        # Wait for cleanup to complete on one node before running
        # it on the next node.
        LOG.debug("Cleaning up orphan data on old cluster nodes.")
        for node in old_nodes:
            nid = node['id']
            node['guest'].node_cleanup_begin()
            node['guest'].node_cleanup()
            LOG.debug("Waiting for node to finish its "
                      "cleanup: %s", nid)
            if not self._all_instances_running([nid], cluster_id):
                LOG.warning(_("Node did not complete cleanup "
                              "successfully: %s"), nid)

        LOG.debug("Cluster configuration finished successfully.")
    except Exception:
        LOG.exception(_("Error growing cluster."))
        self.update_statuses_on_failure(cluster_id)
def _create_cluster():
    """Configure a freshly provisioned set of instances as a Galera cluster.

    Waits for every member to reach cluster-ready status, generates
    the replication credentials and a unique cluster name, resets the
    admin password on all guests, pushes the cluster configuration to
    each node (bootstrapping only the first), and finishes with a
    ``cluster_complete`` call per guest.  Failures mark the cluster
    via ``update_statuses_on_failure``.
    """
    # Fetch instances by cluster_id against instances table.
    db_instances = DBInstance.find_all(cluster_id=cluster_id).all()
    member_ids = [record.id for record in db_instances]

    # Wait for cluster members to get to cluster-ready status.
    LOG.debug("Waiting for instances to get to cluster-ready status.")
    if not self._all_instances_ready(member_ids, cluster_id):
        raise TroveError(_("Instances in cluster did not report "
                           "ACTIVE"))

    LOG.debug("All members ready, proceeding for cluster setup.")

    members = [Instance.load(context, member_id)
               for member_id in member_ids]
    member_ips = [self.get_ip(member) for member in members]
    member_guests = [self.get_guest(member) for member in members]

    # Create replication user and password for synchronizing the
    # galera cluster replication.
    replication_user = {
        "name": self.CLUSTER_REPLICATION_USER,
        "password": utils.generate_random_password(),
    }

    # Galera cluster name must be unique and be shorter than a full
    # uuid string so we remove the hyphens and chop it off. It was
    # recommended to be 16 chars or less.
    # (this is not currently documented on Galera docs)
    cluster_name = utils.generate_uuid().replace("-", "")[:16]

    LOG.debug("Configuring cluster configuration.")
    try:
        # Set the admin password for all the instances because the
        # password in the my.cnf will be wrong after the joiner
        # instances syncs with the donor instance.
        admin_password = str(utils.generate_random_password())
        for guest in member_guests:
            guest.reset_admin_password(admin_password)

        # Only the very first node bootstraps the cluster; every
        # subsequent node joins it.
        bootstrap = True
        for member in members:
            guest = self.get_guest(member)

            # render the conf.d/cluster.cnf configuration
            cluster_configuration = self._render_cluster_config(
                context, member, ",".join(member_ips),
                cluster_name, replication_user)

            # push the cluster config and bootstrap the first instance
            guest.install_cluster(replication_user,
                                  cluster_configuration,
                                  bootstrap)
            bootstrap = False

        LOG.debug("Finalizing cluster configuration.")
        for guest in member_guests:
            guest.cluster_complete()
    except Exception:
        LOG.exception(_("Error creating cluster."))
        self.update_statuses_on_failure(cluster_id)