def on_leaving_cluster(self, cluster_view):
    # Cassandra commands need to be run in the signaling network
    # namespace in split network systems.
    #
    # This function means that there are now two ways of running a
    # command in the signaling namespace - this function, and the
    # namespace parameter to run_command. This plugin does not have
    # access to the signaling namespace name, so we use this function
    # as a tactical workaround.
    def in_sig_namespace(command):
        prefix = "/usr/share/clearwater/bin/run-in-signaling-namespace "
        return prefix + command

    # We must remove the node from the Cassandra cluster. Get the node's ID
    # from nodetool status, then remove it with nodetool removenode.
    # Initialise output so the check below is safe if nodetool fails (grep
    # exits non-zero when this node's IP isn't in the output).
    output = ""
    try:
        status_command = "nodetool status | grep " + self._ip
        output = subprocess.check_output(in_sig_namespace(status_command),
                                         shell=True,
                                         stderr=subprocess.STDOUT)
        _log.debug(
            "Nodetool status succeeded and printed output {!r}".format(
                output))
    except subprocess.CalledProcessError:  # pragma: no cover
        _log.debug("nodetool status failed - node may already have left the cluster")

    if output != "":
        # Pull the UUID from the output.
        for value in output.split():
            if "-" in value:
                remove_command = "nodetool removenode " + value
                run_command(in_sig_namespace(remove_command))
                break
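# A minimal, self-contained sketch of the UUID extraction above, assuming
# typical "nodetool status" output in which the host's row contains the node
# ID as the only whitespace-separated token containing dashes. The helper
# name and the sample row are illustrative, not taken from the plugin.
def extract_node_uuid(status_line):
    """Return the first dash-containing token, or None if there isn't one."""
    for token in status_line.split():
        if "-" in token:
            return token
    return None

# Example (hypothetical status row):
#   row = "UN 10.0.0.1 100 KB 256 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c RAC1"
#   extract_node_uuid(row) -> "8d5ed9f4-7764-4dbd-bad8-43fddce94b7c"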
def on_config_changed(self, value, alarm):
    _log.info("Updating the shared iFC sets configuration file")

    if self.status(value) != FileStatus.UP_TO_DATE:
        safely_write(_file, value)
        run_command(["/usr/share/clearwater/bin/reload_shared_ifcs_xml"])
        alarm.update_file(_file)
def on_config_changed(self, value, alarm):
    _log.info("Updating {}".format(self._file))

    safely_write(self._file, value)
    run_command("service sprout reload")
    alarm.update_file(self._file)
def on_config_changed(self, value, alarm):
    _log.info("Updating {}".format(self._file))

    safely_write(self._file, value)
    run_command("service sprout reload")
    alarm.update_file(self._file)
def on_config_changed(self, value, alarm):
    _log.info("Updating {}".format(self._file))

    with open(self._file, "w") as ofile:
        ofile.write(value)

    run_command("service sprout reload")
    alarm.update_file(self._file)
def on_config_changed(self, value, alarm):
    _log.info("Updating shared configuration file")

    if self.status(value) != FileStatus.UP_TO_DATE:
        safely_write(_file, value)

        if value != _default_value:
            run_command(
                "/usr/share/clearwater/clearwater-queue-manager/scripts/modify_nodes_in_queue add apply_config"
            )
def on_config_changed(self, value, alarm):
    _log.info("Updating Chronos shared configuration file")

    if self.status(value) != FileStatus.UP_TO_DATE:
        safely_write(_file, value)
        run_command([
            "/usr/share/clearwater/clearwater-queue-manager/scripts/modify_nodes_in_queue",
            "add",
            "apply_chronos_shared_config"
        ])
        alarm.update_file(_file)
def on_stable_cluster(self, cluster_view):  # pragma: no cover
    _log.debug("Clearing Cassandra not-clustered alarm")
    self._clustering_alarm.clear()
    pdlogs.STABLE_CLUSTER.log(cluster_desc=self.cluster_description())

    if self._ip == sorted(cluster_view.keys())[0]:
        _log.debug("Adding schemas")
        run_command(
            "/usr/share/clearwater/infrastructure/scripts/cassandra_schemas/run_cassandra_schemas"
        )
def at_front_of_queue(self):
    _log.info("Restarting Chronos")

    if run_command("service chronos stop"):
        _log.warning("Unable to stop Chronos successfully")

    if run_command("service chronos wait-sync"):
        _log.warning("Unable to resync Chronos successfully")

    if run_command(
            "/usr/share/clearwater/clearwater-queue-manager/scripts/modify_nodes_in_queue remove_success apply_chronos_gr_config"
    ):
        _log.warning("Unable to remove this node from the resync queue")

    _log.info("Chronos restarted")
def on_config_changed(self, value, alarm):
    _log.info("Updating SAS configuration file")

    if self.status(value) != FileStatus.UP_TO_DATE:
        safely_write(_file, value)
        run_command(["/usr/share/clearwater/infrastructure/scripts/sas_socket_factory"])
        apply_config_key = subprocess.check_output(
            ["/usr/share/clearwater/clearwater-queue-manager/scripts/get_apply_config_key"])
        run_command(["/usr/share/clearwater/clearwater-queue-manager/scripts/modify_nodes_in_queue",
                     "add",
                     apply_config_key])
        alarm.update_file(_file)
def wait_for_cassandra(self):
    # Don't start Cassandra here - rely on monit or supervisord to start
    # it. This avoids race conditions where both we and monit start it at
    # the same time and two copies come up.
    _log.info("Waiting for Cassandra to come up...")

    # Wait until we can connect on port 9160 - i.e. Cassandra is running.
    attempts = 0
    while not self.can_contact_cassandra():  # pragma: no cover
        # Sleep so we don't tight-loop.
        time.sleep(1)
        attempts += 1
        if (attempts % 10) == 0:
            _log.info("Still waiting for Cassandra to come up...")

    _log.info("Finished waiting for Cassandra to come up")

    # Restart clearwater-infrastructure so any necessary schema creation
    # scripts get run.
    run_command(["sudo", "service", "clearwater-infrastructure", "restart"])
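# The plugin delegates the actual connectivity check to poll_cassandra.sh
# (see can_contact_cassandra further below). Purely for illustration, a
# direct TCP probe of the Thrift port 9160 referred to in the comments
# above could look like this sketch - a hypothetical helper, not part of
# the plugin:
import socket

def port_9160_open(host="127.0.0.1", timeout=1.0):
    """Return True if a TCP connection to Cassandra's Thrift port succeeds."""
    try:
        sock = socket.create_connection((host, 9160), timeout=timeout)
        sock.close()
        return True
    except socket.error:
        return False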
def leave_cassandra_cluster(self):
    # We need Cassandra to be running so that we can connect on port 9160
    # and decommission it. Check if we can connect on port 9160.
    if not self.can_contact_cassandra():  # pragma: no cover
        self.wait_for_cassandra()

    # Remove the cassandra.yaml file first - Cassandra won't start up while
    # it's missing, so this prevents monit or supervisord from
    # auto-restarting it after decommissioning.
    if os.path.exists(self.CASSANDRA_YAML_FILE):
        os.remove(self.CASSANDRA_YAML_FILE)

    run_command(["nodetool", "decommission"], self._sig_namespace)

    # Remove the bootstrapping flags so that we bootstrap correctly if
    # rejoining the cluster again in future.
    if os.path.exists(self.BOOTSTRAP_IN_PROGRESS_FLAG):
        os.remove(self.BOOTSTRAP_IN_PROGRESS_FLAG)

    if os.path.exists(self.BOOTSTRAPPED_FLAG):
        os.remove(self.BOOTSTRAPPED_FLAG)
def remove_node(self):  # pragma: no cover
    # Initialise output so the check below is safe if the pipeline fails.
    output = ""
    try:
        args = ["/usr/share/clearwater/bin/run-in-signaling-namespace",
                "nodetool", "status"]
        process_nodetool = subprocess.Popen(args, stdout=subprocess.PIPE)
        process_grep = subprocess.Popen(["grep", self._ip],
                                        stdin=process_nodetool.stdout,
                                        stdout=subprocess.PIPE)
        # Close our handle on nodetool's stdout so it receives SIGPIPE if
        # grep exits early.
        process_nodetool.stdout.close()
        output = process_grep.communicate()[0]
        _log.debug("Nodetool status succeeded and printed output {!r}".format(
            output))
    except subprocess.CalledProcessError:  # pragma: no cover
        _log.debug("nodetool status failed")

    if output != "":
        # Pull the UUID from the output.
        for value in output.split():
            if "-" in value:
                remove_command = ["/usr/share/clearwater/bin/run-in-signaling-namespace",
                                  "nodetool", "removenode", value]
                run_command(remove_command)
                break
def on_config_changed(self, value, alarm):
    if os.path.exists(_file) and not os.path.exists(_file + ".apply"):
        _log.debug("Ignoring shared config change - shared config already learnt")
        return

    _log.info("Updating shared configuration")
    with open(_file + ".tmp", "w") as ofile:
        ofile.write(value)
    shutil.move(_file + ".tmp", _file)

    _log.info("Restarting services")
    run_command("service clearwater-infrastructure restart")

    # "services" maps each service to the command used to pick up new
    # config - see the illustrative example after this function.
    for service, command in services.iteritems():
        if os.path.exists("/etc/init.d/" + service):
            run_command("service {} {}".format(service, command))

    # Config file is now up-to-date.
    alarm.update_file(_file)

    # Remove the apply file if present.
    try:
        os.remove(_file + ".apply")
    except OSError:
        pass
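# The "services" map above is defined elsewhere in the plugin. A
# hypothetical example of its shape (the service names and commands here
# are illustrative assumptions, not taken from the plugin):
services = {"sprout": "reload",
            "homestead": "restart",
            "ralf": "restart"}
# With this map, a node that has /etc/init.d/sprout installed would run
# "service sprout reload" after a shared config change, while services
# not installed on the node are skipped.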
def add_cassandra_schemas(self):  # pragma: no cover
    # Adding the schemas can fail when Cassandra has just started. If it
    # does fail, simply try again until it succeeds.
    _log.info("Trying to add/update the Cassandra schemas...")

    attempts = 0
    while True:
        rc = run_command(["/usr/share/clearwater/infrastructure/scripts/cassandra_schemas/run_cassandra_schemas"])
        if rc == 0:
            break
        time.sleep(1)
        attempts += 1
        if (attempts % 10) == 0:
            _log.info("Still trying to add/update the Cassandra schemas...")

    _log.info("Finished adding/updating the schemas")
def at_front_of_queue(self):
    _log.info("Restarting clearwater-infrastructure")
    run_command("service clearwater-infrastructure restart")

    if os.path.exists("/usr/share/clearwater/infrastructure/scripts/restart"):
        _log.info("Restarting services")
        for restart_script in os.listdir("/usr/share/clearwater/infrastructure/scripts/restart"):
            run_command("/usr/share/clearwater/infrastructure/scripts/restart/" + restart_script)

    if self._wait_plugin_complete != "N":
        _log.info("Checking service health")
        if run_command("/usr/share/clearwater/clearwater-queue-manager/scripts/check_node_health.py"):
            _log.info("Services failed to restart successfully")
            run_command("/usr/share/clearwater/clearwater-queue-manager/scripts/modify_nodes_in_queue remove_failure apply_config")
        else:
            _log.info("Services restarted successfully")
            run_command("/usr/share/clearwater/clearwater-queue-manager/scripts/modify_nodes_in_queue remove_success apply_config")
    else:
        _log.info("Not checking service health")
        run_command("/usr/share/clearwater/clearwater-queue-manager/scripts/modify_nodes_in_queue remove_success apply_config")
def write_new_cassandra_config(self, seeds_list, destructive_restart=False):
    seeds_list_str = ','.join(map(str, seeds_list))
    _log.info("Cassandra seeds list is {}".format(seeds_list_str))

    # Read the cassandra.yaml template.
    with open(self.CASSANDRA_YAML_TEMPLATE) as f:
        doc = yaml.safe_load(f)

    # Fill in the correct listen_address and seeds values in the yaml
    # document.
    doc["listen_address"] = self._ip
    doc["broadcast_rpc_address"] = self._ip
    doc["seed_provider"][0]["parameters"][0]["seeds"] = seeds_list_str
    doc["endpoint_snitch"] = "GossipingPropertyFileSnitch"

    # We use Thrift timeouts of 250ms, and we need the Cassandra timeouts
    # to be able to fire before that, including inter-node latency, so we
    # set timeouts of 190ms for reads, range-reads and writes.
    doc["read_request_timeout_in_ms"] = 190
    doc["range_request_timeout_in_ms"] = 190
    doc["write_request_timeout_in_ms"] = 190

    # Commit logs. We want to cap these, as the default of 8GB is
    # sufficient to exhaust the root filesystem on a low-spec (20GB) node,
    # but we should allow higher-spec machines to use more disk space to
    # avoid thrashing.
    #
    # Therefore, set the upper threshold for commit logs to 1GB per core,
    # up to the default maximum for a 64-bit machine, namely 8192MB.
    #
    # We ignore security analysis here: although we are shelling out, we
    # are doing so with a fixed command, so it's safe. For safety, we
    # always force the result to be an integer.
    get_core_count = "grep processor /proc/cpuinfo | wc -l"
    core_count = subprocess.check_output(get_core_count,  # nosec
                                         shell=True,
                                         stderr=subprocess.STDOUT)
    try:
        core_count_int = int(core_count)
    except ValueError:  # pragma: no cover
        core_count_int = 2
    doc["commitlog_total_space_in_mb"] = min(core_count_int * 1024, 8192)

    contents = WARNING_HEADER + "\n" + yaml.dump(doc)
    topology = WARNING_HEADER + "\n" + "dc={}\nrack=RAC1\n".format(
        self._local_site)

    # Restart Cassandra and make sure it picks up the new list of seeds.
    _log.info("Restarting Cassandra")

    # Remove the cassandra.yaml file first - Cassandra won't start up
    # while it's missing, so this keeps it stopped while we're clearing
    # out its database.
    if os.path.exists(self.CASSANDRA_YAML_FILE):
        os.remove(self.CASSANDRA_YAML_FILE)

    # Stop Cassandra directly rather than going through any 'service'
    # commands - this should mean that supervisord keeps restarting
    # Cassandra when running in Docker.
    #
    # Note that we can't use the init.d script here, because cassandra.yaml
    # doesn't exist so it immediately exits.
    #
    # We do not want to kill Cassandra if it is in the process of
    # bootstrapping.
    if not os.path.exists(self.BOOTSTRAP_IN_PROGRESS_FLAG):
        run_command(["start-stop-daemon",
                     "-K",
                     "-p", "/var/run/cassandra/cassandra.pid",
                     "-R", "TERM/30/KILL/5"])
        _log.info("Stopped Cassandra while changing config files")

    # We only want to perform these steps the first time we join a
    # cluster. If we are bootstrapping, or already bootstrapped, doing
    # this will leave us unable to rejoin the cluster properly.
    if (destructive_restart and
            not (os.path.exists(self.BOOTSTRAPPED_FLAG) or
                 os.path.exists(self.BOOTSTRAP_IN_PROGRESS_FLAG))):
        _log.warning("Deleting /var/lib/cassandra - this is normal on initial clustering")
        run_command(["rm", "-rf", "/var/lib/cassandra/"])
        run_command(["mkdir", "-m", "755", "/var/lib/cassandra"])
        run_command(["chown", "-R", "cassandra", "/var/lib/cassandra"])

        # Set a state flag if we have performed a destructive restart and
        # not yet completed bootstrapping. This will stop us re-deleting
        # the data directory if the cluster_manager dies, ensuring we
        # cluster correctly.
        open(self.BOOTSTRAP_IN_PROGRESS_FLAG, 'a').close()

    # Write back to cassandra.yaml - this allows Cassandra to start again.
    safely_write(self.CASSANDRA_TOPOLOGY_FILE, topology)
    safely_write(self.CASSANDRA_YAML_FILE, contents)
    self.wait_for_cassandra()

    # If we were previously bootstrapping, alter the state flag to
    # indicate the process is complete. We will remove this when we leave
    # the cluster.
    if os.path.exists(self.BOOTSTRAP_IN_PROGRESS_FLAG):
        os.rename(self.BOOTSTRAP_IN_PROGRESS_FLAG, self.BOOTSTRAPPED_FLAG)

    if os.path.exists("/etc/clearwater/force_cassandra_yaml_refresh"):
        os.remove("/etc/clearwater/force_cassandra_yaml_refresh")
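# safely_write comes from the shared utilities. A minimal sketch of the
# write-to-temp-then-rename pattern it is assumed to follow (os.rename is
# atomic within a filesystem on POSIX, so readers never observe a
# half-written config file) - a hypothetical stand-in, not the real helper:
import os

def safely_write_sketch(path, contents):
    tmp = path + ".tmp"
    with open(tmp, "w") as f:
        f.write(contents)
        f.flush()
        os.fsync(f.fileno())  # ensure the data hits disk before the rename
    os.rename(tmp, path)      # atomically replace the old file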
def on_cluster_changing(self, cluster_view):
    write_memcached_cluster_settings("/etc/clearwater/cluster_settings",
                                     cluster_view)
    run_command("/usr/share/clearwater/bin/reload_memcached_users")
def write_new_cassandra_config(self, seeds_list, destructive_restart=False):
    seeds_list_str = ','.join(map(str, seeds_list))
    _log.info("Cassandra seeds list is {}".format(seeds_list_str))

    # Read the cassandra.yaml template.
    with open(self.CASSANDRA_YAML_TEMPLATE) as f:
        doc = yaml.safe_load(f)

    # Fill in the correct listen_address and seeds values in the yaml
    # document.
    doc["listen_address"] = self._ip
    doc["broadcast_rpc_address"] = self._ip
    doc["seed_provider"][0]["parameters"][0]["seeds"] = seeds_list_str
    doc["endpoint_snitch"] = "GossipingPropertyFileSnitch"

    # Work out the timeout from the target_latency_us value, assuming
    # 100000 if it isn't set.
    get_latency_cmd = "target_latency_us=100000; . /etc/clearwater/config; echo -n $target_latency_us"
    latency = subprocess.check_output(get_latency_cmd,
                                      shell=True,
                                      stderr=subprocess.STDOUT)
    try:
        # We want the timeout value to be 4/5ths the maximum acceptable
        # time for an HTTP request (which is 5 * target latency).
        timeout = (int(latency) / 1000) * 4
    except ValueError:  # pragma: no cover
        timeout = 400
    doc["read_request_timeout_in_ms"] = timeout

    contents = WARNING_HEADER + "\n" + yaml.dump(doc)
    topology = WARNING_HEADER + "\n" + "dc={}\nrack=RAC1\n".format(
        self._local_site)

    # Restart Cassandra and make sure it picks up the new list of seeds.
    _log.info("Restarting Cassandra")

    # Remove the cassandra.yaml file first - Cassandra won't start up
    # while it's missing, so this keeps it stopped while we're clearing
    # out its database.
    if os.path.exists(self.CASSANDRA_YAML_FILE):
        os.remove(self.CASSANDRA_YAML_FILE)

    # Stop Cassandra directly rather than going through any 'service'
    # commands - this should mean that supervisord keeps restarting
    # Cassandra when running in Docker.
    #
    # Note that we can't use the init.d script here, because cassandra.yaml
    # doesn't exist so it immediately exits.
    #
    # We do not want to kill Cassandra if it is in the process of
    # bootstrapping.
    if not os.path.exists(self.BOOTSTRAP_IN_PROGRESS_FLAG):
        run_command(
            "start-stop-daemon -K -p /var/run/cassandra/cassandra.pid -R TERM/30/KILL/5"
        )
        _log.info("Stopped Cassandra while changing config files")

    # We only want to perform these steps the first time we join a
    # cluster. If we are bootstrapping, or already bootstrapped, doing
    # this will leave us unable to rejoin the cluster properly.
    if (destructive_restart and
            not (os.path.exists(self.BOOTSTRAPPED_FLAG) or
                 os.path.exists(self.BOOTSTRAP_IN_PROGRESS_FLAG))):
        _log.warning(
            "Deleting /var/lib/cassandra - this is normal on initial clustering"
        )
        run_command("rm -rf /var/lib/cassandra/")
        run_command("mkdir -m 755 /var/lib/cassandra")
        run_command("chown -R cassandra /var/lib/cassandra")

        # Set a state flag if we have performed a destructive restart and
        # not yet completed bootstrapping. This will stop us re-deleting
        # the data directory if the cluster_manager dies, ensuring we
        # cluster correctly.
        open(self.BOOTSTRAP_IN_PROGRESS_FLAG, 'a').close()

    # Write back to cassandra.yaml - this allows Cassandra to start again.
    safely_write(self.CASSANDRA_TOPOLOGY_FILE, topology)
    safely_write(self.CASSANDRA_YAML_FILE, contents)
    self.wait_for_cassandra()

    # If we were previously bootstrapping, alter the state flag to
    # indicate the process is complete. We will remove this when we leave
    # the cluster.
    if os.path.exists(self.BOOTSTRAP_IN_PROGRESS_FLAG):
        os.rename(self.BOOTSTRAP_IN_PROGRESS_FLAG, self.BOOTSTRAPPED_FLAG)

    if os.path.exists("/etc/clearwater/force_cassandra_yaml_refresh"):
        os.remove("/etc/clearwater/force_cassandra_yaml_refresh")
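# Worked example of the timeout arithmetic above: the maximum acceptable
# time for an HTTP request is 5 * target latency, and the Cassandra read
# timeout is set to 4/5ths of that, i.e. 4 * target latency. With the
# default target_latency_us of 100000:
target_latency_us = 100000
timeout_ms = (target_latency_us / 1000) * 4
assert timeout_ms == 400  # matches the 400ms fallback used on parse errors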
def write_cluster_settings(self, cluster_view):
    if self._remote_site != "":
        write_memcached_cluster_settings(
            "/etc/clearwater/remote_cluster_settings",
            cluster_view)
        run_command("/usr/share/clearwater/bin/reload_memcached_users")
def on_new_cluster_config_ready(self, cluster_view):
    run_command("service astaire reload")
    run_command("service astaire wait-sync")
def on_cluster_changing(self, cluster_view):
    write_chronos_cluster_settings("/etc/chronos/chronos_cluster.conf",
                                   cluster_view,
                                   self.local_server)
    run_command("service chronos reload")
def on_new_cluster_config_ready(self, cluster_view):
    run_command("service chronos resync")
    run_command("service chronos wait-sync")
def on_new_cluster_config_ready(self, cluster_view):  # pragma: no cover
    self._alarm.set()
    run_command("service chronos resync")
    run_command("service chronos wait-sync")
def write_cluster_settings(self, cluster_view):
    write_chronos_cluster_settings("/etc/chronos/chronos_cluster.conf",
                                   cluster_view,
                                   self.local_server,
                                   self.instance_id,
                                   self.deployment_id)
    run_command(["service", "chronos", "reload"])
def can_contact_cassandra(self):
    rc = run_command(["/usr/share/clearwater/bin/poll_cassandra.sh",
                      "--no-grace-period"],
                     log_error=False)
    return rc == 0
def on_new_cluster_config_ready(self, cluster_view):  # pragma: no cover
    self._alarm.set()
    run_command("service astaire reload")
    run_command("service astaire wait-sync")
def on_new_cluster_config_ready(self, cluster_view):  # pragma: no cover
    self._alarm.set()
    run_command(["service", "astaire", "reload"])
    run_command(["service", "astaire", "wait-sync"])
def write_cluster_settings(self, cluster_view):
    write_memcached_cluster_settings("/etc/clearwater/cluster_settings",
                                     cluster_view)
    run_command(["/usr/share/clearwater/bin/reload_memcached_users"])
def on_config_changed(self, value, alarm):
    _log.info("Updating dns configuration file")

    if self.status(value) != FileStatus.UP_TO_DATE:
        safely_write(_file, value)
        run_command("/usr/share/clearwater/bin/reload_dns_config")