def node_reboot(self, ip):
    rpc(ip, 'sudo reboot now', self.username, self.password, self.key,
        timeout=60 * 2)
def db_add_random_node(self, wait_time_min=0, randomness_time_injection=0):
    """
    Removes a random node from the db instantly, then "adds" it back after
    the given wait time by restoring its network interface and rebooting.
    :param wait_time_min: minutes to keep the node out of the cluster.
    :param randomness_time_injection: max extra seconds added at random.
    :return:
    """
    assert (wait_time_min >= 0)
    assert (randomness_time_injection >= 0)
    pick = pick_x_different_num(1, 0, len(self.cassandra.ips) - 1)[0]
    ip = self.cassandra.ips[pick]
    down_time = wait_time_min * 60 + random.randint(0, randomness_time_injection)
    note = 'Chose {%s} node to be added after %s seconds.' % (ip, down_time)
    add_test_note(note)
    # Disable eth0, then reboot at the end, to simulate a node failure.
    cmd = '(nohup sudo ifdown eth0; sleep %s ; sudo ifup eth0 ; sudo reboot now) > /tmp/datos_failure.log &' % (down_time)
    rpc(ip, cmd, self.cassandra.username, self.cassandra.password,
        self.cassandra.key)
    # Ask a neighboring node to remove the downed node from the ring.
    # NOTE: `nodetool removenode` expects the node's host ID (as shown by
    # `nodetool status`), so passing the IP here may need a lookup first.
    ip2 = self.cassandra.ips[(pick + 1) % len(self.cassandra.ips)]
    cmd = 'nodetool removenode %s' % ip
    rpc(ip2, cmd, self.cassandra.username, self.cassandra.password,
        self.cassandra.key)
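# Usage sketch (hypothetical): assumes `chaos` is an instance of the class
# owning this method, with `chaos.cassandra` already configured. Removes a
# random node now and lets it rejoin roughly two minutes later:
#
#     chaos.db_add_random_node(wait_time_min=2, randomness_time_injection=60)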
def db_start(self, ip):
    rpc(ip, 'sudo service cassandra start', self.username, self.password,
        self.key, timeout=60 * 2)
def node_shutdown(self, ip):
    rpc(ip, 'sudo halt', self.username, self.password, self.key,
        timeout=60 * 2)
def delta_worker():
    # Loop every LOOP_MIN minutes and reinitialize the delta population.
    while self.do_delta_population:
        # Stop previous populations, in case they are still running.
        rpc(workload_ip, cmd1, self.username, self.password, self.key)
        time.sleep(2)
        # Start a new batch of populations. No tty so we can run in the
        # background and disconnect.
        rpc(workload_ip, cmd2, self.username, self.password, self.key,
            no_tty=True)
        report('{%s} delta population set on node %s.' % (mgmt_object, workload_ip))
        # Sleep LOOP_MIN minutes, allow the delta to complete and settle,
        # then cycle again (a more dependable approach).
        time.sleep(60 * LOOP_MIN)
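# `delta_worker` is a closure: `workload_ip`, `cmd1`, `cmd2`, `mgmt_object`,
# and `LOOP_MIN` come from the enclosing method. A minimal sketch of how that
# method might drive it (all names here except `delta_worker` and
# `self.do_delta_population` are assumptions):
#
#     import threading
#     LOOP_MIN = 5
#     workload_ip = self.ips[0]
#     cmd1 = "ps -ef | grep -v grep | grep geppetto | awk '{print $2}' | xargs kill -9"
#     cmd2 = '(python ~/.geppetto/data_population.py ...) > /tmp/delta_population.log &'
#     self.do_delta_population = True
#     worker = threading.Thread(target=delta_worker)
#     worker.daemon = True  # Don't block interpreter exit.
#     worker.start()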
def shutdown(self):
    """ Shutdown the whole db cluster. """
    for ip in self.ips:
        rpc(ip, 'sudo service cassandra stop', self.username, self.password,
            self.key, timeout=60 * 2)
def cfstats(self):
    # Cycle through the nodes until we get a result from nodetool cfstats.
    for ip in self.ips:
        out, _ = rpc(ip, "nodetool cfstats", self.username, self.password,
                     self.key)
        # TODO: (Aaron) finish ...
        if out:
            return out
    return ''
def _deliver_payload(self):
    """
    Delivers population scripts and other goodies to the cassandra source
    cluster. Most are stored in ~/.geppetto/
    """
    common_script_path = '%s/common/common.py' % (
        common.common.global_vars['geppetto_install_dir'])
    population_script_path = '%s/db_utils/cassandra_utils/data_population.py' % (
        common.common.global_vars['geppetto_install_dir'])
    schema_folder_path = '%s/db_utils/cassandra_utils/schema' % (
        common.common.global_vars['geppetto_install_dir'])
    for ip in self.ips:
        report('Updating Geppetto payload on {%s}.' % ip)
        to_path = '%s@%s:~/.geppetto/' % (self.username, ip)
        # rpc(ip, 'rm -rf ~/.geppetto', self.username, self.password, self.key, suppress_output=True)
        rpc(ip, 'mkdir -p ~/.geppetto/common', self.username, self.password,
            self.key, suppress_output=True)
        rpc(ip, 'touch ~/.geppetto/common/__init__.py', self.username,
            self.password, self.key, suppress_output=True)
        scp(common_script_path, '%s/common/' % to_path, self.password,
            self.key, suppress_output=True)
        scp(population_script_path, to_path, self.password, self.key,
            suppress_output=True)
        scp(schema_folder_path, to_path, self.password, self.key,
            is_dir=True, suppress_output=True)
    self.payload = True
    return True
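# After delivery, each node should have the following layout under
# ~/.geppetto/ (derived from the mkdir/touch/scp calls above):
#
#     ~/.geppetto/
#         common/
#             __init__.py
#             common.py
#         data_population.py
#         schema/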
def nodetool_status(self):
    # Cycle through the nodes until we get a result from nodetool status.
    for ip in self.source_params['ips']:
        out, err = rpc(
            ip,
            "nodetool status | grep 'UN\|UL\|UJ\|UM\|DN\|DL\|DJ\|DM\|===='",
            self.username, self.password, self.key, suppress_output=True)
        if any(x in out for x in
               ['UN', 'UL', 'UJ', 'UM', 'DN', 'DL', 'DJ', 'DM']):
            return out
    response = pause_execution_for_input(
        'No status received from Cassandra Nodetool', level='info')
    if response == 'r':
        return self.nodetool_status()
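# The two-letter codes grepped above are nodetool node states: the first
# letter is Up/Down, the second is Normal/Leaving/Joining/Moving, so 'UN'
# means Up and Normal, 'DN' means Down and Normal, and so on. Illustrative
# lines the grep would match (addresses and sizes made up):
#
#     ====================
#     UN  10.0.0.1  256.45 KB  256  ?  f4c5...  rack1
#     DN  10.0.0.2  251.12 KB  256  ?  a9b1...  rack1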
def get_compaction_history(self):
    cmd = 'nodetool compactionhistory'
    for ip in self.ips:
        try:
            out, err = rpc(ip, cmd, self.username, self.password, self.key)
            return out
        except Exception:
            pass  # Try the next node.
    return ''
def clean(self):
    """
    Caution! Empties database directories and commit logs for all nodes in db.
    :return:
    """
    report('Cleaning data and commitlog directories for cluster {%s}' %
           (self.name), 'warning')
    cmd = 'sudo service cassandra stop'
    for ip in self.ips:
        rpc(ip, cmd, self.username, self.password, self.key)
    time.sleep(10)
    cmd_list = [
        'rm -f ~/.__jmxcmd*',
        'sudo rm -rf %s/*' % self.data_dir,
        'sudo rm -rf %s/*' % self.commitlog_dir,
        'sudo service cassandra start',
    ]
    # Wipe and restart the first node alone, then the rest one at a time,
    # giving each 30 seconds to come up and join the ring.
    for ip in self.ips[:1]:
        for cmd in cmd_list:
            rpc(ip, cmd, self.username, self.password, self.key)
        time.sleep(30)
    for ip in self.ips[1:]:
        for cmd in cmd_list:
            rpc(ip, cmd, self.username, self.password, self.key)
        time.sleep(30)
    report('Status cluster {%s} \n %s' % (self.name, self.status()))
def mass_worker():
    record_count_per_node = int(record_count / len(population_ips))
    node_start_record = start_record
    auth_string = ''
    if self.db_user:
        auth_string = '--db_user %s --db_pass %s' % (self.db_user, self.db_pass)
    for ip in population_ips:
        report('Setting mass population on cluster {%s} node {%s}.' %
               (self.name, ip), 'warning')
        # Clean log first.
        cmd = 'sudo rm /tmp/mass_population.log'
        rpc(ip, cmd, self.username, self.password, self.key)
        cmd = '(python ~/.geppetto/data_population.py ' \
              '%s %s %s ' \
              'insert ' \
              '-r %s ' \
              '-s %s ' \
              '-n %s ' \
              '-t %s ' \
              '--replication %s ' \
              ') > /tmp/mass_population.log &' % \
              (ip, schema_file, auth_string, record_size, node_start_record,
               record_count_per_node, mgmt_object, replication)
        node_start_record += record_count_per_node
        # No tty so we can run in the background and disconnect.
        rpc(ip, cmd, self.username, self.password, self.key, no_tty=True)
    if not async:
        # Poll the nodes until every population process has exited.
        cmd = 'ps -ef | grep geppetto | grep -v grep | wc -l'
        cmd2 = 'tail -1 /tmp/mass_population.log'
        while True:
            try:
                report('Populating ...')
                processes_running = 0
                for ip in population_ips:
                    out, err = rpc(ip, cmd, self.username, self.password,
                                   self.key, suppress_output=True)
                    out2, err2 = rpc(ip, cmd2, self.username, self.password,
                                     self.key, suppress_output=True)
                    report('<%s> %s' % (ip, out2))
                    try:
                        processes_running += int(out)
                    except Exception as e:
                        report(e, 'critical')
                        raise
                if processes_running == 0:
                    break
            except Exception as e:
                report(e, 'critical')
                break
            time.sleep(15)
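# For illustration, the population command assembled above renders roughly
# as follows (all values hypothetical; auth_string is empty when no db_user
# is set):
#
#     (python ~/.geppetto/data_population.py 10.0.0.1 my_schema.cql  insert -r 1024 -s 0 -n 500000 -t my_table --replication 3 ) > /tmp/mass_population.log &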
def db_remove_random_node(self, wait_time_min=0, run_time_min=10,
                          randomness_time_injection=0):
    """
    Removes a random node from the cassandra cluster for the given time.
    :param wait_time_min: minutes to wait before injecting the failure.
    :param run_time_min: minutes the node stays out of the cluster.
    :param randomness_time_injection: max extra seconds added at random.
    :return:
    """
    assert (wait_time_min >= 0)
    assert (run_time_min >= 0)
    assert (randomness_time_injection >= 0)
    time.sleep(wait_time_min * 60)
    time.sleep(random.randint(0, randomness_time_injection))
    pick = pick_x_different_num(1, 0, len(self.cassandra.ips) - 1)[0]
    ip = self.cassandra.ips[pick]
    # Subtract about 45 seconds to account for reboot time at the end.
    down_time = int(max(0, run_time_min * 60 - 45))
    note = 'Chose {%s} node to be removed for %s seconds.' % (ip, down_time)
    add_test_note(note)
    # Disable eth0, then reboot at the end, to simulate a node failure.
    cmd = '(nohup sudo ifdown eth0; sleep %s ; sudo ifup eth0 ; sudo reboot now) > /tmp/datos_failure.log &' % (down_time)
    rpc(ip, cmd, self.cassandra.username, self.cassandra.password,
        self.cassandra.key)
    # Ask a neighboring node to remove the downed node from the ring.
    ip2 = self.cassandra.ips[(pick + 1) % len(self.cassandra.ips)]
    cmd = 'nodetool removenode %s' % ip
    rpc(ip2, cmd, self.cassandra.username, self.cassandra.password,
        self.cassandra.key)
def single_random_node_failure(self, wait_time_min=0, run_time_min=10,
                               time_length_of_failure=5,
                               max_failure_repeats=1,
                               randomness_time_injection=90):
    """
    Shuts down a random cassandra node for a given time with some randomness
    thrown in for timing.
    """
    assert (wait_time_min >= 0)
    assert (run_time_min >= 0.1)
    assert (time_length_of_failure >= 0.2)
    assert (max_failure_repeats > 0)
    assert (randomness_time_injection >= 0)
    self.cassandra.status()
    start_time = time.time()
    time.sleep(60 * wait_time_min)
    pick = pick_x_different_num(1, 0, len(self.cassandra.ips) - 1)[0]
    for _ in xrange(max_failure_repeats):
        # Randomize the time at which the db node fails.
        time.sleep(random.randint(0, randomness_time_injection))
        currently_down = collections.deque()
        try:
            # Bring down the db node for some time.
            currently_down.append(pick)
            try:
                note = 'Bringing down node {%s} for %s minutes.' % (
                    self.cassandra.ips[pick], time_length_of_failure)
                add_test_note(note)
                rpc(self.cassandra.ips[pick],
                    '(nohup sudo ifdown eth0; sleep %s ; sudo ifup eth0 ; ) > /tmp/datos_failure.log &' % (time_length_of_failure * 60),
                    self.cassandra.username, self.cassandra.password,
                    self.cassandra.key)
            except Exception:
                report('Could not connect to node {%s}.' %
                       self.cassandra.ips[pick], 'warning')
            self.cassandra.status()  # Let's see the db state.
            time.sleep(60 * time_length_of_failure + 60)
            # Bring the db node back up. Currently we don't have a good way
            # to restore, so this does nothing.
            self.cassandra.node_restore(self.cassandra.ips[pick])
            currently_down.popleft()
            time.sleep(20)
            self.cassandra.status()
        except (KeyboardInterrupt, SystemExit) as e:
            # Do some clean up (restore db nodes) and some reporting, then
            # re-raise the exception.
            report('Exit detected ... restoring db state', 'critical')
            for i in currently_down:
                self.cassandra.node_restore(self.cassandra.ips[i])
            time.sleep(20)
            self.cassandra.status()  # Logs will capture output.
            global global_vars
            global_vars['test_status'] = 'Aborted'
            add_test_note(str(e))
            raise
        # Exit the failure loop if we've reached max time (run_time_min
        # counted from the end of the initial wait).
        if time.time() - start_time >= (wait_time_min + run_time_min) * 60:
            break
def stop_mass_population(self):
    self.do_mass_population = False
    # Kill any geppetto population processes still running on the nodes.
    cmd = '''ps -ef | grep -v grep | grep geppetto | awk '{print $2}' | xargs kill -9'''
    for ip in self.ips:
        rpc(ip, cmd, self.username, self.password, self.key)