def install(self):
    """Install the build toolchain and clone the repo on every testbed host.

    A single ``&&``-chained shell command is run on all hosts returned by
    ``self.manager.hosts(flat=True)``.

    Raises:
        BenchError: if the group run raises ``GroupException`` (wrapped in
            ``FabricError``) or ``ExecutionError`` (passed through as-is).
    """
    Print.info('Installing rust and cloning the repo...')

    repo_url = self.settings.repo_url
    repo_name = self.settings.repo_name

    system_refresh = [
        'sudo apt-get update',
        'sudo apt-get -y upgrade',
        'sudo apt-get -y autoremove',
    ]
    # These packages prevent the error: [error: linker `cc` not found].
    build_tools = [
        'sudo apt-get -y install build-essential',
        'sudo apt-get -y install cmake',
    ]
    # Non-interactive rust installation.
    rust_toolchain = [
        'curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y',
        'source $HOME/.cargo/env',
        'rustup default stable',
    ]
    # Missing from the Rocksdb installer (needed for Rocksdb).
    rocksdb_extras = ['sudo apt-get install -y clang']
    # Clone the repo, or pull if the clone fails.
    fetch_repo = [f'(git clone {repo_url} || (cd {repo_name} ; git pull))']

    script = ' && '.join(
        system_refresh + build_tools + rust_toolchain + rocksdb_extras
        + fetch_repo
    )

    hosts = self.manager.hosts(flat=True)
    try:
        testbed = Group(*hosts, user='******', connect_kwargs=self.connect)
        testbed.run(script, hide=True)
        Print.heading(f'Initialized testbed of {len(hosts)} nodes')
    except GroupException as err:
        raise BenchError('Failed to install repo on testbed', FabricError(err))
    except ExecutionError as err:
        raise BenchError('Failed to install repo on testbed', err)
def _config(self, hosts, node_parameters):
    """Generate configuration files locally and upload them to ``hosts``.

    Locally: clean old files, recompile, alias the binaries, generate one
    key file and one threshold key file per host, then emit the committee
    and parameters files. Remotely: clean every host, then upload the
    committee file, the parameters file and the i-th key / threshold key
    files to the i-th host.

    Args:
        hosts: Sequence of host addresses. Its length fixes the number of
            keys and its order fixes which key files each host receives.
        node_parameters: Object whose ``print(path)`` is called with the
            parameters file path (presumably writing the file there).

    Returns:
        The ``Committee`` built from the generated keys and the addresses.

    Raises:
        subprocess.CalledProcessError: if compilation or key generation
            exits with a non-zero status (those runs use ``check=True``).
    """
    Print.info('Generating configuration files...')

    # Cleanup all local configuration files (errors are silenced).
    cmd = CommandMaker.cleanup()
    subprocess.run([cmd], shell=True, stderr=subprocess.DEVNULL)

    # Recompile the latest code.
    cmd = CommandMaker.compile().split()
    subprocess.run(cmd, check=True, cwd=PathMaker.node_crate_path())

    # Create alias for the client and nodes binary.
    cmd = CommandMaker.alias_binaries(PathMaker.binary_path())
    subprocess.run([cmd], shell=True)

    # Generate one key file per host and load it back.
    nodes = len(hosts)
    keys = []
    for filename in [PathMaker.key_file(i) for i in range(nodes)]:
        cmd = CommandMaker.generate_key(filename).split()
        subprocess.run(cmd, check=True)
        keys.append(Key.from_file(filename))

    # Generate all threshold signature files with a single command.
    threshold_files = [PathMaker.threshold_key_file(i) for i in range(nodes)]
    cmd = './node threshold_keys'
    for filename in threshold_files:
        cmd += ' --filename ' + filename
    subprocess.run(cmd.split(), capture_output=True, check=True)

    names = [x.name for x in keys]
    consensus_addr = [f'{x}:{self.settings.consensus_port}' for x in hosts]
    front_addr = [f'{x}:{self.settings.front_port}' for x in hosts]
    tss_keys = [TSSKey.from_file(filename) for filename in threshold_files]
    ids = [x.id for x in tss_keys]
    mempool_addr = [f'{x}:{self.settings.mempool_port}' for x in hosts]
    committee = Committee(names, ids, consensus_addr, front_addr, mempool_addr)
    committee.print(PathMaker.committee_file())

    node_parameters.print(PathMaker.parameters_file())

    # Cleanup all nodes; `|| true` keeps a failed cleanup from aborting.
    cmd = f'{CommandMaker.cleanup()} || true'
    g = Group(*hosts, user='******', connect_kwargs=self.connect)
    g.run(cmd, hide=True)

    # Upload configuration files.
    progress = progress_bar(hosts, prefix='Uploading config files:')
    for i, host in enumerate(progress):
        # The context manager closes the connection once this host's
        # uploads are done, so connections do not pile up across hosts.
        with Connection(host, user='******', connect_kwargs=self.connect) as c:
            c.put(PathMaker.committee_file(), '.')
            c.put(PathMaker.key_file(i), '.')
            c.put(PathMaker.threshold_key_file(i), '.')
            c.put(PathMaker.parameters_file(), '.')

    return committee
def run_commands(connections: ThreadingGroup, commands_file) -> None:
    """
    Execute the commands file on every remote host, prefixing each output
    line (stdout and stderr merged) with the name of the host it came from.

    **Note about the host prefix**
    Per http://www.fabfile.org/upgrading.html, the "env.output_prefix"
    option was not ported to fabric 2.X. As a workaround, each host name is
    first written to a file on that host, and the output of the commands is
    piped through ``sed`` to prepend the content of that file.

    The process exits with status 1 if the group run raises
    ``GroupException``; ``remote_cleanup`` is called in every case.

    :param connections: The remote connections
    :param commands_file: The remote path of the commands file to execute
    :return:
    """
    # Step 1: leave each host's own name in a well-known file on that host.
    for remote in connections:
        remote.run(
            f'echo "{remote.host}" > /tmp/evaneos_ssh__fabric_host')

    # Step 2: run the commands, tagging every output line with that name.
    prefixed_command = (
        f'{commands_file} 2>&1 '
        '| sed "s/^/[$(cat /tmp/evaneos_ssh__fabric_host)] /"'
    )
    try:
        connections.run(prefixed_command)
    except GroupException:
        sys.exit(1)
    finally:
        remote_cleanup(connections, commands_file)
def excepted_command(self):
    """Running on unresolvable hosts raises GroupException of gaierrors."""
    bad_hosts = ('nopebadhost1', 'nopebadhost2')
    group = Group(*bad_hosts)
    try:
        group.run('lolnope', hide=True)
    except GroupException as exc:
        # Every per-host value must be a name-resolution error.
        for failure in exc.result.values():
            ok_(isinstance(failure, gaierror))
        return
    assert False, "Did not raise GroupException!"
def excepted_command(self):
    """Running on unresolvable hosts raises GroupException of gaierrors."""
    unreachable = ["nopebadhost1", "nopebadhost2"]
    group = Group(*unreachable)
    try:
        group.run("lolnope", hide=True)
    except GroupException as exc:
        # Every per-host value must be a name-resolution error.
        for outcome in exc.result.values():
            assert isinstance(outcome, gaierror)
        return
    assert False, "Did not raise GroupException!"
def excepted_command(self):
    """Running on unresolvable hosts raises GroupException of gaierrors."""
    hostnames = ('nopebadhost1', 'nopebadhost2')
    group = Group(*hostnames)
    try:
        group.run('lolnope', hide=True)
    except GroupException as raised:
        # Every per-host value must be a name-resolution error.
        for per_host in raised.result.values():
            assert isinstance(per_host, gaierror)
        return
    assert False, "Did not raise GroupException!"
def remote_cleanup(connections: ThreadingGroup, commands_file: str) -> None:
    """
    Cleans up remote hosts' filesystem.

    Removes, on every host of the group, the uploaded commands file and the
    ``/tmp/evaneos_ssh__fabric_host`` file. The two removals are separate
    group runs, so an exception raised by the first one prevents the second.

    :param connections: The remote connections
    :param commands_file: The command file to delete
    """
    connections.run('rm {file}'.format(file=commands_file))
    # Fixed path: there is no placeholder to fill, so no formatting is done.
    connections.run('rm /tmp/evaneos_ssh__fabric_host')
def kill(self, hosts=[], delete_logs=False):
    """Kill the running nodes on the given hosts, optionally deleting logs.

    Args:
        hosts: List of hosts to act on. When empty, all hosts returned by
            ``self.manager.hosts(flat=True)`` are used. The default list is
            never mutated here.
        delete_logs: When True, the log-cleaning command runs before the
            kill command.

    Raises:
        BenchError: if the group run raises ``GroupException``.
    """
    assert isinstance(hosts, list)
    assert isinstance(delete_logs, bool)

    targets = hosts or self.manager.hosts(flat=True)

    # 'true' is a shell no-op that keeps the `&&` chain well-formed.
    if delete_logs:
        log_step = CommandMaker.clean_logs()
    else:
        log_step = 'true'
    # `|| true` stops a failing kill command from failing the whole run.
    kill_step = f'({CommandMaker.kill()} || true)'

    try:
        nodes = Group(*targets, user='******', connect_kwargs=self.connect)
        nodes.run(' && '.join((log_step, kill_step)), hide=True)
    except GroupException as err:
        raise BenchError('Failed to kill nodes', FabricError(err))
def failed_command(self):
    """A failing command raises GroupException; each host exits with 127."""
    group = Group('localhost', '127.0.0.1')
    try:
        group.run('lolnope', hide=True)
    except GroupException as exc:
        # exc.result is a GroupResult whose values are UnexpectedExit here;
        # each UnexpectedExit carries a Result under .result, which exposes
        # the exit code as .exited.
        exit_codes = []
        for failure in exc.result.values():
            exit_codes.append(failure.result.exited)
        assert [127, 127] == exit_codes
        return
    assert False, "Did not raise GroupException!"
def failed_command(self):
    """A failing command raises GroupException; each host exits with 127."""
    targets = ("localhost", "127.0.0.1")
    group = Group(*targets)
    try:
        group.run("lolnope", hide=True)
    except GroupException as raised:
        # raised.result is a GroupResult whose values are UnexpectedExit
        # here; UnexpectedExit.result is a Result, hence .exited.
        per_host_failures = raised.result.values()
        exits = [failure.result.exited for failure in per_host_failures]
        assert [127, 127] == exits
        return
    assert False, "Did not raise GroupException!"
def failed_command(self):
    """A failing command raises GroupException; each host exits with 127."""
    group = Group('localhost', '127.0.0.1')
    try:
        group.run('lolnope', hide=True)
    except GroupException as exc:
        # exc.result is a GroupResult whose values are UnexpectedExit here;
        # UnexpectedExit.result is a Result, which exposes .exited.
        observed = [failure.result.exited for failure in exc.result.values()]
        expected = [127, 127]
        eq_(observed, expected)
        return
    assert False, "Did not raise GroupException!"
def _update(self, hosts):
    """Check out the configured branch on ``hosts`` and rebuild the node.

    One ``&&``-chained command is run on all hosts: fetch, checkout and
    pull inside the repo directory, load the cargo environment, compile in
    the ``node`` sub-directory, then alias the release binaries.

    Args:
        hosts: Hosts to update.
    """
    repo = self.settings.repo_name
    branch = self.settings.branch

    Print.info(f'Updating {len(hosts)} nodes (branch "{branch}")...')

    # Each git step runs in its own subshell rooted in the repo directory.
    git_actions = ('fetch -f', f'checkout -f {branch}', 'pull -f')
    steps = [f'(cd {repo} && git {action})' for action in git_actions]
    steps.append('source $HOME/.cargo/env')
    steps.append(f'(cd {repo}/node && {CommandMaker.compile()})')
    steps.append(CommandMaker.alias_binaries(f'./{repo}/target/release/'))

    nodes = Group(*hosts, user='******', connect_kwargs=self.connect)
    nodes.run(' && '.join(steps), hide=True)
def simple_command(self):
    """A simple echo run on the group yields its output once per host."""
    targets = ('localhost', '127.0.0.1')
    group = Group(*targets)
    result = group.run('echo foo', hide=True)
    outputs = []
    for host_result in result.values():
        outputs.append(host_result.stdout.strip())
    eq_(outputs, ['foo', 'foo'])
def run_all2one(bramble, server_ip, niter=niter):
    """Run the many-clients-to-one-server iperf experiment.

    Every member of ``bramble`` whose ``host`` differs from ``server_ip``
    acts as a client. An iperf server is started in the background on the
    server host; then, for each of the ``niter`` iterations, leftover iperf
    processes on the clients are killed and every client runs iperf against
    the server, appending to its remote client log. Results are gathered
    through ``gather_all2one_results`` once all iterations are done.

    :param bramble: Iterable of objects exposing a ``host`` attribute
    :param server_ip: Address of the host acting as the iperf server
    :param niter: Number of iterations to run
    """
    server = Connection(server_ip, user='******', connect_kwargs=cxn_args)

    client_ips = []
    for node in bramble:
        if node.host != server_ip:
            client_ips.append(node.host)
    clients = ThreadingGroup(*client_ips, user='******',
                             connect_kwargs=cxn_args)

    print(f"Begin {len(clients)} clients to 1 server experiment")

    # Restart the server side from a clean state.
    server.run("killall -q iperf", warn=True)
    time.sleep(10)  # wait for old process to die
    server.run(f"iperf -s > {remote_output_dir}/server.log &")

    client_command = (
        f"iperf -P 20 -c {server.host} >> {remote_output_dir}/client.log"
    )
    for iteration in range(niter):
        print(f"Iteration {iteration}")
        clients.run("killall -q iperf", warn=True)
        time.sleep(10)  # wait for processes to die
        clients.run(client_command)

    gather_all2one_results(clients, server)
def upload_command_file_to_remotes(connections: ThreadingGroup) -> str:
    """
    Copy the local commands file to every remote host and make it
    executable there.

    The remote file gets a freshly generated UUID-based name under ``/tmp``;
    the same name is used on every host.

    :param connections: The remote connections
    :return: The remote path of the command file
    """
    local_commands_file = '/var/local/github-actions/commands'
    remote_commands_file = f'/tmp/{uuid.uuid4()}.sh'

    for remote in connections:
        remote.put(local_commands_file, remote_commands_file)

    connections.run(f'chmod a+x {remote_commands_file}')
    return remote_commands_file
def test_hosts_are_reachable(connections_to_test: ThreadingGroup) -> None:
    """
    Run a dummy command on the remote hosts to make sure they are reachable.

    If the group run raises ``GroupException``, an error is printed for each
    connection whose outcome is an exception, then the process exits with
    status 1.

    :param connections_to_test: The connections we make sure are reachable
    """
    try:
        connections_to_test.run('echo "dry-run"', hide=True)
    except GroupException as group_error:
        # Distinct names keep the per-connection outcome from shadowing the
        # caught GroupException.
        for failed_connection, outcome in group_error.result.items():
            if isinstance(outcome, Exception):
                print_github_action_error(
                    'Error on "{host}": {message}'.format(
                        host=failed_connection.host, message=str(outcome)))
        sys.exit(1)
def simple_command(self):
    """A simple echo run on the group yields its output once per host."""
    targets = ("localhost", "127.0.0.1")
    group = Group(*targets)
    result = group.run("echo foo", hide=True)
    outs = []
    for host_result in result.values():
        outs.append(host_result.stdout.strip())
    assert ["foo", "foo"] == outs
class RemoteFlowResources(Resources):
    """Resources provided by remote hosts reached through a fabric ``Group``.

    ``hosts`` maps host names to per-host option dicts. The options read by
    this class are ``max_workers``, ``rc``, ``submission_root`` and
    ``prolog``; each falls back to the value given to the constructor.
    Connections are created in sorted host-name order, so the i-th
    connection of ``self.connections`` belongs to the i-th sorted host name.
    """

    def __init__(self, user, hosts, max_workers=100, submission_root=None, prolog="", rc=None):
        """Store the configuration and create one connection per host.

        :param user: User name for the connections and the ``squeue`` queries
        :param hosts: Dict ``{host_name: options}`` or an iterable of host
            names (turned into a dict with empty options)
        :param max_workers: Default maximum number of workers per host
        :param submission_root: Default root directory for submissions
        :param prolog: Default prolog passed to the flow template
        :param rc: Default local path of the rc file uploaded by ``init``
        """
        if not isinstance(hosts, dict):
            hosts = {host: {} for host in hosts}

        self.user = user
        self.max_workers = max_workers
        self.prolog = prolog
        self.hosts = hosts
        self.submission_root = submission_root
        self.rc = rc
        self.connections = Group(*sorted(hosts.keys()), user=user)
        for connection in self.connections:
            connection.connect_timeout = 360

    def _host_connections(self):
        """Return ``(host_name, connection)`` pairs in sorted host order."""
        return list(zip(sorted(self.hosts.keys()), self.connections))

    @staticmethod
    def _count_states(squeue_output):
        """Count the occurrences of each non-empty line after the header."""
        states = dict()
        for line in squeue_output.split("\n")[1:]:  # ignore `ST` header
            line = line.strip()
            if not line:
                continue
            states[line] = states.get(line, 0) + 1
        return states

    @staticmethod
    def _count_jobs(host_name, states, jobs):
        """Log the state counts and record in ``jobs`` the non-'CG' total."""
        logger.debug('Nodes availability')
        for state, number in sorted(states.items()):
            logger.debug('{}: {}'.format(state, number))

        # 'CG' jobs are excluded from the count (presumably the scheduler's
        # "completing" state -- confirm against the scheduler docs).
        jobs[host_name] = sum(number for name, number in states.items()
                              if name != 'CG')
        logger.debug('total: {}'.format(jobs[host_name]))

    def init(self):
        """Set up every host: rc file, mahler config, flow and sregistry.

        Hosts raising an authentication error on the first remote operation
        are reported and skipped.

        :raises ValueError: if no ``rc`` path is defined for a host, neither
            in its options nor globally
        """
        for host_name, host_connection in self._host_connections():
            print('Adding source remoteflowrc in {}:bashrc'.format(host_name))
            try:
                patchwork.files.append(host_connection, '.bashrc',
                                       'source $HOME/.remoteflowrc')
            except paramiko.ssh_exception.AuthenticationException as e:
                print(e)
                continue

            # The rc path is a per-host option with a global fallback, like
            # `max_workers`, `submission_root` and `prolog`.
            rc_file_path = self.hosts[host_name].get('rc', self.rc)
            if rc_file_path is None:
                raise ValueError(
                    "`rc` is not defined for host {} nor globally.".format(
                        host_name))

            with open(rc_file_path, 'r') as rc_file:
                local_rc = rc_file.read()

            try:
                out = host_connection.run('cat .remoteflowrc', hide=True)
            except invoke.exceptions.UnexpectedExit:
                # The remote rc file could not be read: (re)create it.
                update_rc = True
            else:
                # [:-2] is a dirty hack to drop the line
                # `export CLUSTERNAME={}`
                remote_rc = '\n'.join(out.stdout.split('\n')[:-2])
                update_rc = remote_rc != local_rc

            if update_rc:
                print('Updating {}:.remoteflowrc'.format(host_name))
                host_connection.put(rc_file_path)
                patchwork.files.append(
                    host_connection, '.remoteflowrc',
                    'export CLUSTERNAME={}'.format(host_name))

            print('Updating {}:.config/mahler'.format(host_name))
            patchwork.transfers.rsync(
                host_connection,
                '{}/.remoteconfig/mahler'.format(os.environ['HOME']),
                '.config')

            # if not patchwork.environment.have_program(host_connection, 'flow-submit'):
            print('Installing flow on {}'.format(host_name))
            host_connection.run(
                'pip install --upgrade --user git+https://github.com/bouthilx/flow.git',
                hide=True)

            # if not patchwork.environment.have_program(host_connection, 'sregistry'):
            print('Installing sregistry on {}'.format(host_name))
            host_connection.run('pip install --upgrade --user sregistry[all]',
                                hide=True)

    def available(self, squash=True):
        """Return the number of workers still available.

        :param squash: When true, return one total clamped at 0; otherwise
            return a dict ``{host_name: available}`` whose values are not
            clamped and may be negative
        """
        command = 'squeue -r -o %t -u {user}'.format(user=self.user)

        jobs = {}
        max_workers = 0
        result = self.connections.run(command, hide=True, warn=True)
        for host_name, connection in self._host_connections():
            logger.debug('squeue on {}'.format(host_name))
            max_workers += self.hosts[host_name].get('max_workers',
                                                     self.max_workers)
            states = self._count_states(result[connection].stdout)
            self._count_jobs(host_name, states, jobs)

        if squash:
            return max(max_workers - sum(jobs.values()), 0)

        return {
            host_name: host_config.get('max_workers', self.max_workers) -
            jobs[host_name]
            for host_name, host_config in self.hosts.items()
        }

    def run(self, commandline):
        """Run a command on every host and print each host's output.

        :param commandline: Iterable of strings, joined with spaces
        """
        commandline = " ".join(commandline)
        print('executing:\n' + commandline)
        result = self.connections.run(commandline, hide=True, warn=True)
        for host_name, connection in self._host_connections():
            print(" {} ".format(host_name))
            print('-' * (len(host_name) + 4))
            out = result[connection]
            print('\nstdout')
            print('------')
            print(out.stdout)
            print('\nstderr')
            print('------')
            print(out.stderr)

    def info(self):
        """Return a text report of available nodes and job states per host."""
        command = 'squeue -r -o %t -u {user}'.format(user=self.user)

        jobs = {}
        status = dict()
        result = self.connections.run(command, hide=True)
        for host_name, connection in self._host_connections():
            logger.debug('squeue on {}'.format(host_name))
            states = self._count_states(result[connection].stdout)
            status[host_name] = states
            self._count_jobs(host_name, states, jobs)

        nodes_available = {
            host_name: host_config.get('max_workers', self.max_workers) -
            jobs[host_name]
            for host_name, host_config in self.hosts.items()
        }

        total_nodes_available = sum(nodes_available.values())

        lines = ['{} nodes available'.format(total_nodes_available)]
        for host_name, host_nodes_available in sorted(nodes_available.items()):
            lines.append(' {:<10}: {} nodes available'.format(
                host_name, max(host_nodes_available, 0)))

        lines += ['', "Status:"]
        for host_name in sorted(nodes_available.keys()):
            lines.append(" " + host_name)
            for state_name, state_number in sorted(status[host_name].items()):
                lines.append(" {}: {}".format(state_name, state_number))
            lines.append("")

        return '\n'.join(lines)

    def submit(self, tasks, container=None, tags=tuple(), working_dir=None, num_workers=None,
               force_num_tasks=None):
        """Pull the container on every host, then submit where possible.

        Nothing is pulled or submitted when no host has a positive number
        of available nodes; hosts without available nodes are skipped.
        """
        nodes_available = self.available(squash=False)
        total_nodes_available = sum(nodes_available.values())
        print('{} nodes available'.format(total_nodes_available))
        for host_name, host_nodes_available in nodes_available.items():
            print('{:>20}: {} nodes available'.format(host_name,
                                                      host_nodes_available))

        # Per-host values are not clamped: a host over its limit is
        # negative and must not hide capacity available on another host.
        if not any(number > 0 for number in nodes_available.values()):
            return

        print('Pulling container {} on logging node of hosts {}'.format(
            container, list(self.hosts.keys())))

        result = self.connections.run('sregistry pull {}'.format(container),
                                      hide=True, warn=True)

        host_connections = self._host_connections()

        print("\nCommand output")
        for host_name, connection in host_connections:
            # Look results up by connection instead of relying on the
            # ordering of the result mapping.
            out = result[connection]
            print(host_name)
            print(out.stdout)
            print(out.stderr)

        for host_name, connection in host_connections:
            # A non-positive count would yield an invalid `array=1-N` option.
            if nodes_available[host_name] <= 0:
                continue

            # filter tasks if they have host attached which != host_name
            # self.submit_single_host(filtered_tasks)
            self.submit_single_host(host_name, connection, tasks,
                                    nodes_available[host_name], tags,
                                    container, working_dir, num_workers,
                                    force_num_tasks)

        # TODO: make submission separately for each host because they have different number of
        #       available nodes. Also, add a duplication ratio, so that tasks are submitted to
        #       multiple hosts and the faster wins the race.

    def submit_single_host(self, host_name, connection, tasks, nodes_available, tags, container,
                           working_dir, num_workers, force_num_tasks):
        """Build and run the submission command on one host.

        The resource options are read from the first task only.

        :raises ValueError: if a resource name is unknown, or if no
            ``submission_root`` is defined for the host nor globally
        """
        n_tasks = force_num_tasks if force_num_tasks else len(tasks)

        array_option = 'array=1-{};'.format(min(n_tasks, nodes_available))
        flow_options = FLOW_OPTIONS_TEMPLATE.format(
            array=array_option, job_name=".".join(sorted(tags)))

        resources = []
        for name, value in tasks[0]['facility']['resources'].items():
            if name == 'cpu':
                resources.append('cpus-per-task={}'.format(value))
            elif name == 'gpu':
                resources.append('gres=gpu:{}'.format(value))
            elif name == 'mem':
                resources.append('mem={}'.format(value))
            elif name not in IGNORE_RESOURCES:
                raise ValueError('Unknown option: {}'.format(name))

        flow_options += ";" + ";".join(resources)

        submission_root = self.hosts[host_name].get('submission_root',
                                                    self.submission_root)
        if submission_root is None:
            raise ValueError(
                "submission_root is not defined for host {} nor globally.".
                format(host_name))

        submission_dir = os.path.join(submission_root, container)
        # The directory lives on the remote host, so the local filesystem
        # says nothing about it; `mkdir -p` is safe to run unconditionally.
        # TODO: this folder should be created in _ensure_remote_setup.
        connection.run('mkdir -p {}'.format(submission_dir))

        prolog = self.hosts[host_name].get('prolog', self.prolog)

        flow_command = FLOW_TEMPLATE.format(container=container,
                                            root_dir=submission_dir,
                                            prolog=prolog,
                                            options=flow_options)

        extra_options = {}
        if working_dir:
            extra_options['working-dir'] = working_dir
        if num_workers:
            extra_options['num-workers'] = num_workers

        if extra_options:
            options = ' ' + ' '.join('--{}={}'.format(k, v)
                                     for k, v in extra_options.items())
        else:
            options = ''

        command = COMMAND_TEMPLATE.format(
            container=" --container " + container if container else "",
            tags=" --tags " + " ".join(tags) if tags else "",
            options=options)

        submit_command = SUBMIT_COMMANDLINE_TEMPLATE.format(flow=flow_command,
                                                            command=command)

        print("Executing on {}:".format(host_name))
        print(submit_command)
        out = connection.run(submit_command, hide=True, warn=True)
        print("\nCommand output")
        print("------")
        print(out.stdout)
        print(out.stderr)