def __init__(self, log_dir='logs'):
    """
    @param log_dir: Top-level directory in which to write log files.
                    A separate subdirectory will be created in this
                    directory for the log files from this run. This can
                    only be overridden by passing it to __init__() since
                    that method creates the subdirectory.
                    (default: logs)
    """
    self.log_level = 'NOTICE'
    self.verbose = False
    self.transport = 'infrc'
    self.replicas = 3
    self.disk = default_disk1
    self.coordinator = None
    self.next_server_id = 0
    self.next_client_id = 0
    self.masters_started = 0
    self.backups_started = 0
    self.log_subdir = log.createDir(log_dir)
    self.sandbox = Sandbox()
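# Example (sketch, not part of the original file): constructing the cluster
# object that this initializer belongs to. The enclosing class name is an
# assumption here (written as Cluster); only log_dir is configurable at
# construction time, since __init__() itself creates the per-run subdirectory.
#
#     cluster = Cluster(log_dir='logs')
#     # cluster.log_subdir now names a fresh subdirectory created by
#     # log.createDir(), and cluster.sandbox will manage the processes
#     # started later through this object.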
def run(
    num_servers=4,             # Number of hosts on which to start
                               # servers (not including the coordinator).
    backups_per_server=1,      # Number of backups to run on each
                               # server host (0, 1, or 2).
    replicas=3,                # Replication factor to use for each
                               # log segment.
    disk1=default_disk1,       # Server arguments specifying the
                               # backing device for the first backup
                               # on each server.
    disk2=default_disk2,       # Server arguments specifying the
                               # backing device for the second backup
                               # on each server (if backups_per_server == 2).
    timeout=20,                # How many seconds to wait for the
                               # clients to complete.
    coordinator_args='',       # Additional arguments for the
                               # coordinator.
    master_args='',            # Additional arguments for each server
                               # that runs a master.
    backup_args='',            # Additional arguments for each server
                               # that runs a backup.
    log_level='NOTICE',        # Log level to use for all servers.
    log_dir='logs',            # Top-level directory in which to write
                               # log files. A separate subdirectory
                               # will be created in this directory
                               # for the log files from this run.
    client='echo',             # Command line to invoke for each client;
                               # additional arguments containing
                               # configuration information (such as -C)
                               # will be prepended.
    num_clients=1,             # Number of client processes to run.
                               # They will all run on separate
                               # machines, if possible, but if there
                               # aren't enough available machines then
                               # multiple clients will run on some
                               # machines.
    share_hosts=False,         # True means clients can be run on
                               # machines running servers, if needed.
    transport='infrc',         # Name of transport to use for servers.
    verbose=False,             # Print information about progress in
                               # starting clients and servers.
    debug=False,               # If True, pause after starting all
                               # servers to allow for debugging setup
                               # such as attaching gdb.
    old_master_host=None,      # Pass a (hostname, ip, id) tuple to
                               # construct a large master on that host
                               # before the others are started. Useful
                               # for creating the old master for
                               # recoveries.
    old_master_args=""         # Additional arguments for the
                               # old master (e.g. total RAM).
    ):
    """
    Start a coordinator and servers, as indicated by the arguments. Then
    start one or more client processes and wait for them to complete.
    @return: string indicating the path to the log files for this run.
""" if num_servers > len(hosts): raise Exception("num_servers (%d) exceeds the available hosts (%d)" % (num_servers, len(hosts))) # Create a subdirectory of the log directory for this run log_subdir = log.createDir(log_dir) coordinator = None servers = [] clients = [] with Sandbox() as sandbox: def ensure_servers(numMasters, numBackups): sandbox.checkFailures() try: sandbox.rsh( hosts[0][0], '%s -C %s -m %d -b %d -l 1 --wait 5 ' '--logFile %s/ensureServers.log' % (ensure_servers_bin, coordinator_locator, numMasters, numBackups, log_subdir)) except: # prefer exceptions from dead processes to timeout error sandbox.checkFailures() raise # Start coordinator if num_servers > 0: coordinator_host = hosts[0] coordinator_locator = coord_locator(transport, coordinator_host) coordinator = sandbox.rsh( coordinator_host[0], ('%s -C %s -l %s --logFile %s/coordinator.%s.log %s' % (coordinator_binary, coordinator_locator, log_level, log_subdir, coordinator_host[0], coordinator_args)), bg=True, stderr=subprocess.STDOUT) ensure_servers(0, 0) if verbose: print "Coordinator started on %s at %s" % (coordinator_host[0], coordinator_locator) # Track how many services are registered with the coordinator # for ensure_servers masters_started = 0 backups_started = 0 # Start old master - a specialized master for recovery with lots of data if old_master_host: host = old_master_host command = ('%s -C %s -L %s -M -r %d -l %s ' '--logFile %s/oldMaster.%s.log %s' % (server_binary, coordinator_locator, server_locator(transport, host), replicas, log_level, log_subdir, host[0], old_master_args)) servers.append( sandbox.rsh(host[0], command, ignoreFailures=True, bg=True, stderr=subprocess.STDOUT)) masters_started += 1 ensure_servers(masters_started, 0) # Start servers for i in range(num_servers): # First start the main server on this host, which runs a master # and possibly a backup. The first server shares the same machine # as the coordinator. host = hosts[i] command = ('%s -C %s -L %s -r %d -l %s ' '--logFile %s/server.%s.log %s' % (server_binary, coordinator_locator, server_locator(transport, host), replicas, log_level, log_subdir, host[0], master_args)) if backups_per_server > 0: command += ' %s %s' % (disk1, backup_args) masters_started += 1 backups_started += 1 else: command += ' -M' masters_started += 1 servers.append( sandbox.rsh(host[0], command, bg=True, stderr=subprocess.STDOUT)) if verbose: print "Server started on %s at %s" % ( host[0], server_locator(transport, host)) # Start an extra backup server in this host, if needed. if backups_per_server == 2: command = ( '%s -C %s -L %s -B %s -l %s ' '--logFile %s/backup.%s.log %s' % (server_binary, coordinator_locator, server_locator(transport, host, second_backup_port), disk2, log_level, log_subdir, host[0], backup_args)) servers.append( sandbox.rsh(host[0], command, bg=True, stderr=subprocess.STDOUT)) backups_started += 1 if verbose: print "Extra backup started on %s at %s" % ( host[0], server_locator(transport, host, second_backup_port)) if debug: print "Servers started; pausing for debug setup." 
raw_input("Type <Enter> to continue: ") if masters_started > 0 or backups_started > 0: ensure_servers(masters_started, backups_started) if verbose: print "All servers running" # Start clients args = client.split(" ") client_bin = args[0] client_args = " ".join(args[1:]) host_index = num_servers for i in range(num_clients): if host_index >= len(hosts): if share_hosts or num_servers >= len(hosts): host_index = 0 else: host_index = num_servers client_host = hosts[host_index] command = ('%s -C %s --numClients %d --clientIndex %d ' '--logFile %s/client%d.%s.log %s' % (client_bin, coordinator_locator, num_clients, i, log_subdir, i, client_host[0], client_args)) clients.append(sandbox.rsh(client_host[0], command, bg=True)) if verbose: print "Client %d started on %s: %s" % (i, client_host[0], command) host_index += 1 # Wait for all of the clients to complete start = time.time() for i in range(num_clients): while clients[i].returncode is None: sandbox.checkFailures() time.sleep(.1) if time.time() - start > timeout: raise Exception('timeout exceeded') if verbose: print "Client %d finished" % i return log_subdir
def __init__(self, log_dir='logs', log_exists=False,
             cluster_name_exists=False):
    """
    @param log_dir: Top-level directory in which to write log files.
                    A separate subdirectory will be created in this
                    directory for the log files from this run. This can
                    only be overridden by passing it to __init__() since
                    that method creates the subdirectory.
                    (default: logs)
    @param log_exists: Indicates whether the log directory already exists.
                       This will be true for cluster objects that are
                       created after starting the clusterperf test.
                       (default: False)
    @param cluster_name_exists: Indicates whether a cluster name already
                                exists as part of this test. Backups that
                                are started/restarted using the same
                                cluster name will read data from the
                                replicas. (default: False)
    """
    self.log_level = 'NOTICE'
    self.verbose = False
    self.transport = 'basic+infud'
    self.replicas = 3
    self.disk = default_disk1
    self.disjunct = False
    if cluster_name_exists:
        # Do nothing if a cluster name already exists.
        self.cluster_name = None
        if self.verbose:
            print('Cluster name exists')
    else:
        self.cluster_name = 'cluster_' + ''.join([
            chr(random.choice(range(ord('a'), ord('z'))))
            for c in range(20)])
        if self.verbose:
            print('Cluster name is %s' % (self.cluster_name))
    self.coordinator = None
    self.next_server_id = 1
    self.next_client_id = 1
    self.masters_started = 0
    self.backups_started = 0
    if config.hooks.other_hosts:
        self.coordinator_host = config.hooks.other_hosts[0]
    else:
        self.coordinator_host = getHosts()[0]
    self.coordinator_locator = coord_locator(self.transport,
                                             self.coordinator_host)
    self.log_subdir = log.createDir(log_dir, log_exists)
    # Create a perfcounters directory under the log directory.
    os.mkdir(self.log_subdir + '/perfcounters')
    if not log_exists:
        self.sandbox = Sandbox()
    else:
        self.sandbox = Sandbox(cleanup=False)
    # Create the shm directory to store shared files (it may already exist).
    try:
        os.mkdir('%s/logs/shm' % os.getcwd())
    except:
        pass
    f = open('%s/logs/shm/README' % os.getcwd(), 'w+')
    f.write('This directory contains files that correspond to '
            'different server processes that were started during '
            'the last run of clusterperf. Filename is\n'
            '"<hostname>_<pid>". Each of these files stores '
            'the service locator of the respective server, which is '
            'used to give information to the client.\nThe existence '
            'of this file at the end of a clusterperf run means '
            'that processes were not cleaned up properly the last '
            'time. So one can use these pids during manual clean up.')
    if not cluster_name_exists:
        # Store the name of the cluster by creating an empty file with
        # the appropriate file name in shm, so that new backups created
        # using a different cluster object can use it to read data from
        # their disks.
        f = open('%s/logs/shm/%s' % (os.getcwd(), self.cluster_name), 'w+')
def __init__(self, log_dir='logs', log_exists=False,
             cluster_name_exists=False):
    """
    @param log_dir: Top-level directory in which to write log files.
                    A separate subdirectory will be created in this
                    directory for the log files from this run. This can
                    only be overridden by passing it to __init__() since
                    that method creates the subdirectory.
                    (default: logs)
    @param log_exists: Indicates whether the log directory already exists.
                       This will be true for cluster objects that are
                       created after starting the clusterperf test.
                       (default: False)
    @param cluster_name_exists: Indicates whether a cluster name already
                                exists as part of this test. Backups that
                                are started/restarted using the same
                                cluster name will read data from the
                                replicas. (default: False)
    """
    self.log_level = 'NOTICE'
    self.verbose = False
    self.transport = 'infrc'
    self.replicas = 3
    self.disk = default_disk1
    self.disjunct = False
    if cluster_name_exists:
        # Do nothing if a cluster name already exists.
        self.cluster_name = None
        if self.verbose:
            print('Cluster name exists')
    else:
        self.cluster_name = 'cluster_' + ''.join([
            chr(random.choice(range(ord('a'), ord('z'))))
            for c in range(20)])
        if self.verbose:
            print('Cluster name is %s' % (self.cluster_name))
    self.coordinator = None
    self.next_server_id = 1
    self.next_client_id = 1
    self.masters_started = 0
    self.backups_started = 0
    self.coordinator_host = hosts[0]
    self.coordinator_locator = coord_locator(self.transport,
                                             self.coordinator_host)
    self.log_subdir = log.createDir(log_dir, log_exists)
    # Create a perfcounters directory under the log directory.
    os.mkdir(self.log_subdir + '/perfcounters')
    if not log_exists:
        self.sandbox = Sandbox()
    else:
        self.sandbox = Sandbox(cleanup=False)
    # Create the shm directory to store shared files (it may already exist).
    try:
        os.mkdir('%s/logs/shm' % os.getcwd())
    except:
        pass
    f = open('%s/logs/shm/README' % os.getcwd(), 'w+')
    f.write('This directory contains files that correspond to '
            'different server processes that were started during '
            'the last run of clusterperf. Filename is\n'
            '"<hostname>_<pid>". Each of these files stores '
            'the service locator of the respective server, which is '
            'used to give information to the client.\nThe existence '
            'of this file at the end of a clusterperf run means '
            'that processes were not cleaned up properly the last '
            'time. So one can use these pids during manual clean up.')
    if not cluster_name_exists:
        # Store the name of the cluster by creating an empty file with
        # the appropriate file name in shm, so that new backups created
        # using a different cluster object can use it to read data from
        # their disks.
        f = open('%s/logs/shm/%s' % (os.getcwd(), self.cluster_name), 'w+')
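    # Example (sketch, not part of the original file): how a later run could
    # recover the cluster name recorded by the empty marker file created
    # above. The helper below is hypothetical; it relies only on the
    # 'cluster_' prefix used when the marker file is written to logs/shm.
    #
    #     import glob
    #     def _find_existing_cluster_name():
    #         matches = glob.glob('%s/logs/shm/cluster_*' % os.getcwd())
    #         if matches:
    #             # The marker file is empty; its basename is the cluster name.
    #             return os.path.basename(matches[0])
    #         return None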