Example #1
    def install(self):
        Print.info('Installing rust and cloning the repo...')
        cmd = [
            'sudo apt-get update',
            'sudo apt-get -y upgrade',
            'sudo apt-get -y autoremove',

            # The following dependencies prevent the error: [error: linker `cc` not found].
            'sudo apt-get -y install build-essential',
            'sudo apt-get -y install cmake',

            # Install rust (non-interactive).
            'curl --proto "=https" --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y',
            'source $HOME/.cargo/env',
            'rustup default stable',

            # This is missing from the Rocksdb installer (needed for Rocksdb).
            'sudo apt-get install -y clang',

            # Clone the repo.
            f'(git clone {self.settings.repo_url} || (cd {self.settings.repo_name} ; git pull))'
        ]
        hosts = self.manager.hosts(flat=True)
        try:
            g = Group(*hosts, user='******', connect_kwargs=self.connect)
            g.run(' && '.join(cmd), hide=True)
            Print.heading(f'Initialized testbed of {len(hosts)} nodes')
        except (GroupException, ExecutionError) as e:
            e = FabricError(e) if isinstance(e, GroupException) else e
            raise BenchError('Failed to install repo on testbed', e)
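
The commands are joined with ' && ' because each call to Group.run starts a
fresh shell on every host, so state set by one command (such as sourcing
$HOME/.cargo/env) would not survive into a separate call. A minimal sketch of
the pattern, with placeholder hosts and user:

from fabric import ThreadingGroup as Group

cmd = [
    'source $HOME/.cargo/env',  # only visible to later commands in this same shell
    'cargo --version',
]
g = Group('10.0.0.1', '10.0.0.2', user='ubuntu')
g.run(' && '.join(cmd), hide=True)
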
Example #2
    def _config(self, hosts, node_parameters):
        Print.info('Generating configuration files...')

        # Cleanup all local configuration files.
        cmd = CommandMaker.cleanup()
        subprocess.run([cmd], shell=True, stderr=subprocess.DEVNULL)

        # Recompile the latest code.
        cmd = CommandMaker.compile().split()
        subprocess.run(cmd, check=True, cwd=PathMaker.node_crate_path())

        # Create aliases for the client and node binaries.
        cmd = CommandMaker.alias_binaries(PathMaker.binary_path())
        subprocess.run([cmd], shell=True)

        # Generate configuration files.
        keys = []
        key_files = [PathMaker.key_file(i) for i in range(len(hosts))]
        for filename in key_files:
            cmd = CommandMaker.generate_key(filename).split()
            subprocess.run(cmd, check=True)
            keys += [Key.from_file(filename)]

        # Generate threshold signature files.
        nodes = len(hosts)
        cmd = './node threshold_keys'
        for i in range(nodes):
            cmd += ' --filename ' + PathMaker.threshold_key_file(i)
        cmd = cmd.split()
        subprocess.run(cmd, capture_output=True, check=True)

        names = [x.name for x in keys]
        consensus_addr = [f'{x}:{self.settings.consensus_port}' for x in hosts]
        front_addr = [f'{x}:{self.settings.front_port}' for x in hosts]
        tss_keys = []
        for i in range(nodes):
            tss_keys += [TSSKey.from_file(PathMaker.threshold_key_file(i))]
        ids = [x.id for x in tss_keys]
        mempool_addr = [f'{x}:{self.settings.mempool_port}' for x in hosts]
        committee = Committee(names, ids, consensus_addr, front_addr,
                              mempool_addr)
        committee.print(PathMaker.committee_file())

        node_parameters.print(PathMaker.parameters_file())

        # Cleanup all nodes.
        cmd = f'{CommandMaker.cleanup()} || true'
        g = Group(*hosts, user='******', connect_kwargs=self.connect)
        g.run(cmd, hide=True)

        # Upload configuration files.
        progress = progress_bar(hosts, prefix='Uploading config files:')
        for i, host in enumerate(progress):
            c = Connection(host, user='******', connect_kwargs=self.connect)
            c.put(PathMaker.committee_file(), '.')
            c.put(PathMaker.key_file(i), '.')
            c.put(PathMaker.threshold_key_file(i), '.')
            c.put(PathMaker.parameters_file(), '.')

        return committee
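
Two subprocess idioms appear above: with shell=True the command is handed to
the shell as a single string, while an argument list with check=True runs the
program directly and raises on a non-zero exit. A minimal illustration:

import subprocess

# Shell form: one string, so pipes, globs and && are interpreted by the shell.
subprocess.run('echo hello && echo world', shell=True)

# Exec form: an argument list, no shell involved; check=True raises
# subprocess.CalledProcessError if the exit status is non-zero.
subprocess.run(['echo', 'hello'], check=True)
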
Example #3
def run_commands(connections: ThreadingGroup, commands_file) -> None:
    """
    Run commands on each remote host and prefix the output (stdout + stderr)
    with the hostname.

    **Note about the host prefix**

    Per http://www.fabfile.org/upgrading.html, the "env.output_prefix" option
    has not been ported to fabric 2.X.

    We work around this by writing each host's name to a file on the remote
    and prefixing every line of command output with it.

    :param connections: The remote connections
    :param commands_file: The remote path of the command file to run
    """

    connection: Connection
    for connection in connections:
        connection.run('echo "{host}" > /tmp/evaneos_ssh__fabric_host'.format(
            host=connection.host))

    try:
        connections.run(
            '{commands} 2>&1 | sed "s/^/[$(cat /tmp/evaneos_ssh__fabric_host)] /"'
            .format(commands=commands_file))
    except GroupException:
        sys.exit(1)
    finally:
        remote_cleanup(connections, commands_file)
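
A possible call site, assuming the commands file was already uploaded to the
hosts (see upload_command_file_to_remotes in Example #13 below); hosts and
user are placeholders:

from fabric import ThreadingGroup

connections = ThreadingGroup('10.0.0.1', '10.0.0.2', user='deploy')
run_commands(connections, '/tmp/commands.sh')  # path of an already-uploaded script
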
Example #4
 def excepted_command(self):
     group = Group('nopebadhost1', 'nopebadhost2')
     try:
         group.run('lolnope', hide=True)
     except GroupException as e:
         for value in e.result.values():
             ok_(isinstance(value, gaierror))
     else:
         assert False, "Did not raise GroupException!"
Example #5
 def excepted_command(self):
     group = Group("nopebadhost1", "nopebadhost2")
     try:
         group.run("lolnope", hide=True)
     except GroupException as e:
         for value in e.result.values():
             assert isinstance(value, gaierror)
     else:
         assert False, "Did not raise GroupException!"
Example #6
def remote_cleanup(connections: ThreadingGroup, commands_file: str) -> None:
    """
    Cleans up remote hosts' filesystem.

    :param connections: The remote connections
    :param commands_file: The command file to delete
    """
    connections.run('rm {file}'.format(file=commands_file))
    connections.run('rm /tmp/evaneos_ssh__fabric_host')
Example #7
 def kill(self, hosts=[], delete_logs=False):
     assert isinstance(hosts, list)
     assert isinstance(delete_logs, bool)
     hosts = hosts if hosts else self.manager.hosts(flat=True)
     delete_logs = CommandMaker.clean_logs() if delete_logs else 'true'
     cmd = [delete_logs, f'({CommandMaker.kill()} || true)']
     try:
         g = Group(*hosts, user='******', connect_kwargs=self.connect)
         g.run(' && '.join(cmd), hide=True)
     except GroupException as e:
         raise BenchError('Failed to kill nodes', FabricError(e))
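
The (... || true) wrapper keeps a failing kill from aborting the whole &&
chain. When an entire command is allowed to fail, Fabric's warn=True achieves
the same thing without shell tricks, turning a non-zero exit into a failed
Result instead of an exception. A sketch with placeholder hosts and pkill as a
stand-in for whatever CommandMaker.kill() produces:

from fabric import ThreadingGroup as Group

g = Group('10.0.0.1', '10.0.0.2', user='ubuntu')
# warn=True: non-zero exits are reported in the Result, not raised.
g.run('pkill -f node', warn=True, hide=True)
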
Example #8
 def failed_command(self):
     group = Group('localhost', '127.0.0.1')
     try:
         group.run('lolnope', hide=True)
     except GroupException as e:
         # GroupException.result -> GroupResult;
         # GroupResult values will be UnexpectedExit in this case;
         # UnexpectedExit.result -> Result, and thus .exited etc.
         exits = [x.result.exited for x in e.result.values()]
         assert [127, 127] == exits
     else:
         assert False, "Did not raise GroupException!"
Example #9
 def failed_command(self):
     group = Group('localhost', '127.0.0.1')
     try:
         group.run('lolnope', hide=True)
     except GroupException as e:
         # GroupException.result -> GroupResult;
         # GroupResult values will be UnexpectedExit in this case;
         # UnexpectedExit.result -> Result, and thus .exited etc.
         eq_(
             [x.result.exited for x in e.result.values()],
             [127, 127],
         )
     else:
         assert False, "Did not raise GroupException!"
Example #10
 def _update(self, hosts):
     Print.info(
         f'Updating {len(hosts)} nodes (branch "{self.settings.branch}")...'
     )
     cmd = [
         f'(cd {self.settings.repo_name} && git fetch -f)',
         f'(cd {self.settings.repo_name} && git checkout -f {self.settings.branch})',
         f'(cd {self.settings.repo_name} && git pull -f)',
         'source $HOME/.cargo/env',
         f'(cd {self.settings.repo_name}/node && {CommandMaker.compile()})',
         CommandMaker.alias_binaries(
             f'./{self.settings.repo_name}/target/release/')
     ]
     g = Group(*hosts, user='******', connect_kwargs=self.connect)
     g.run(' && '.join(cmd), hide=True)
Example #11
 def simple_command(self):
     group = Group('localhost', '127.0.0.1')
     result = group.run('echo foo', hide=True)
     eq_(
         [x.stdout.strip() for x in result.values()],
         ['foo', 'foo'],
     )
Example #12
def run_all2one(bramble, server_ip, niter=niter):
    server = Connection(server_ip, user='******', connect_kwargs=cxn_args)
    ips = [c.host for c in bramble if c.host != server_ip]
    clients = ThreadingGroup(*ips, user='******', connect_kwargs=cxn_args)
    print(f"Begin {len(clients)} clients to 1 server experiment")

    server.run("killall -q iperf", warn=True)
    time.sleep(10)  # wait for old process to die
    server.run(f"iperf -s > {remote_output_dir}/server.log &")

    for i in range(niter):
        print(f"Iteration {i}")
        clients.run("killall -q iperf", warn=True)
        time.sleep(10)  # wait for processes to die
        clients.run(
            f"iperf -P 20 -c {server.host} >> {remote_output_dir}/client.log")

    gather_all2one_results(clients, server)
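
Launching the iperf server with a trailing & relies on the remote shell
leaving the process alive after the session closes, which is fragile.
Assuming Invoke 1.4+, where run() accepts disown=True, the hand-off can be
made explicit (reusing the server and remote_output_dir names from above):

# Sketch: start the server and immediately disown it, so closing the SSH
# session does not kill it (disown=True is an Invoke 1.4+ feature).
server.run(
    f"iperf -s > {remote_output_dir}/server.log 2>&1",
    disown=True,
)
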
Example #13
def upload_command_file_to_remotes(connections: ThreadingGroup) -> str:
    """
    Upload the command file to execute remotely on each remote host.

    :param connections: The remote connections
    :return: The remote path of the command file
    """

    remote_commands_file = '/tmp/{base_filename}.sh'.format(
        base_filename=str(uuid.uuid4()))

    connection: Connection
    for connection in connections:
        connection.put('/var/local/github-actions/commands',
                       remote_commands_file)

    connections.run('chmod a+x {file}'.format(file=remote_commands_file))

    return remote_commands_file
Example #14
def test_hosts_are_reachable(connections_to_test: ThreadingGroup) -> None:
    """
    Run a dummy command on each remote host to make sure they are reachable.

    If an issue arises during connection, display the exception for each
    failing connection.

    :param connections_to_test: The connections we make sure are reachable
    """

    try:
        connections_to_test.run('echo "dry-run"', hide=True)
    except GroupException as group_error:
        for error_connection, connection_error in group_error.result.items():
            if isinstance(connection_error, Exception):
                print_github_action_error(
                    'Error on "{host}": {message}'.format(
                        host=error_connection.host,
                        message=str(connection_error)))
        sys.exit(1)
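
Taken together, the helpers above form a small pipeline: check that the hosts
are reachable, upload the command file, run it with host-prefixed output, and
let run_commands clean up in its finally block. A sketch of the glue, with
placeholder hosts:

from fabric import ThreadingGroup

connections = ThreadingGroup('10.0.0.1', '10.0.0.2', user='deploy')
test_hosts_are_reachable(connections)
commands_file = upload_command_file_to_remotes(connections)
run_commands(connections, commands_file)  # calls remote_cleanup() when done
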
Example #15
 def simple_command(self):
     group = Group("localhost", "127.0.0.1")
     result = group.run("echo foo", hide=True)
     outs = [x.stdout.strip() for x in result.values()]
     assert ["foo", "foo"] == outs
Example #16
class RemoteFlowResources(Resources):
    """
    Manage workers on remote Slurm clusters over SSH (Fabric): query
    availability with ``squeue`` and submit jobs with ``flow``.
    """
    def __init__(self,
                 user,
                 hosts,
                 max_workers=100,
                 submission_root=None,
                 prolog="",
                 rc=None):
        """
        """
        if not isinstance(hosts, dict):
            hosts = {host: {} for host in hosts}

        self.user = user
        self.max_workers = max_workers
        self.prolog = prolog
        self.hosts = hosts
        self.submission_root = submission_root
        self.rc = rc
        self.connections = Group(*list(sorted(hosts.keys())), user=user)
        for connection in self.connections:
            connection.connect_timeout = 360

    def init(self):

        for host_name, host_connection in zip(sorted(self.hosts.keys()),
                                              self.connections):
            print('Adding source remoteflowrc in {}:bashrc'.format(host_name))
            try:
                patchwork.files.append(host_connection, '.bashrc',
                                       'source $HOME/.remoteflowrc')
            except paramiko.ssh_exception.AuthenticationException as e:
                print(e)
                continue

            rc_file_path = self.hosts[host_name].get('rc', self.rc)

            if rc_file_path is None:
                raise ValueError(
                    "`rc` is not defined for host {} nor globally.".format(
                        host_name))

            update_rc = False
            try:
                out = host_connection.run('cat .remoteflowrc', hide=True)
                # [:-2] is a dirty hack to drop the line `export CLUSTERNAME={}`
                with open(rc_file_path, 'r') as rc_file:
                    if '\n'.join(out.stdout.split('\n')[:-2]) != rc_file.read():
                        update_rc = True
            except invoke.exceptions.UnexpectedExit:
                update_rc = True

            if update_rc:
                print('Updating {}:.remoteflowrc'.format(host_name))
                host_connection.put(rc_file_path)
                patchwork.files.append(
                    host_connection, '.remoteflowrc',
                    'export CLUSTERNAME={}'.format(host_name))

            print('Updating {}:.config/mahler'.format(host_name))
            patchwork.transfers.rsync(
                host_connection,
                '{}/.remoteconfig/mahler'.format(os.environ['HOME']),
                '.config')

            # if not patchwork.environment.have_program(host_connection, 'flow-submit'):
            print('Installing flow on {}'.format(host_name))
            host_connection.run(
                'pip install --upgrade --user git+https://github.com/bouthilx/flow.git',
                hide=True)

            # if not patchwork.environment.have_program(host_connection, 'sregistry'):
            print('Installing sregistry on {}'.format(host_name))
            host_connection.run('pip install --upgrade --user sregistry[all]',
                                hide=True)

    def available(self, squash=True):
        """
        """
        command = 'squeue -r -o %t -u {user}'.format(user=self.user)
        jobs = {}
        max_workers = 0
        result = self.connections.run(command, hide=True, warn=True)
        for host_name in sorted(self.hosts.keys()):
            logger.debug('squeue on {}'.format(host_name))
            max_workers += self.hosts[host_name].get('max_workers',
                                                     self.max_workers)
            out = result[self.connections[list(sorted(
                self.hosts.keys())).index(host_name)]]
            out = out.stdout
            states = dict()
            for line in out.split("\n")[1:]:  # ignore `ST` header
                line = line.strip()
                if not line:
                    continue

                if line not in states:
                    states[line] = 0

                states[line] += 1

            logger.debug('Nodes availability')
            for state, number in sorted(states.items()):
                logger.debug('{}: {}'.format(state, number))

            jobs[host_name] = sum(number for name, number in states.items()
                                  if name != 'CG')
            logger.debug('total: {}'.format(jobs[host_name]))

        if squash:
            return max(max_workers - sum(jobs.values()), 0)
        else:
            return {
                host_name: host_config.get('max_workers', self.max_workers) -
                jobs[host_name]
                for host_name, host_config in self.hosts.items()
            }

    def run(self, commandline):
        commandline = " ".join(commandline)
        print('executing:\n' + commandline)
        result = self.connections.run(commandline, hide=True, warn=True)
        for host_name in sorted(self.hosts.keys()):
            print("  {}  ".format(host_name))
            print('-' * (len(host_name) + 4))
            out = result[self.connections[list(sorted(
                self.hosts.keys())).index(host_name)]]
            print('\nstdout')
            print('------')
            print(out.stdout)
            print('\nstderr')
            print('------')
            print(out.stderr)

    def info(self):
        command = 'squeue -r -o %t -u {user}'.format(user=self.user)
        jobs = {}
        max_workers = 0
        result = self.connections.run(command, hide=True)

        status = dict()
        for host_name in self.hosts.keys():
            logger.debug('squeue on {}'.format(host_name))
            max_workers += self.hosts[host_name].get('max_workers',
                                                     self.max_workers)
            out = result[self.connections[list(sorted(
                self.hosts.keys())).index(host_name)]]
            out = out.stdout
            states = dict()
            for line in out.split("\n")[1:]:  # ignore `ST` header
                line = line.strip()
                if not line:
                    continue

                if line not in states:
                    states[line] = 0

                states[line] += 1

            status[host_name] = states

            logger.debug('Nodes availability')
            for state, number in sorted(states.items()):
                logger.debug('{}: {}'.format(state, number))

            jobs[host_name] = sum(number for name, number in states.items()
                                  if name != 'CG')
            logger.debug('total: {}'.format(jobs[host_name]))

        nodes_available = {
            host_name:
            host_config.get('max_workers', self.max_workers) - jobs[host_name]
            for host_name, host_config in self.hosts.items()
        }
        total_nodes_available = sum(nodes_available.values())
        lines = ['{} nodes available'.format(total_nodes_available)]
        for host_name, host_nodes_available in sorted(nodes_available.items()):
            lines.append('  {:<10}: {} nodes available'.format(
                host_name, max(host_nodes_available, 0)))

        lines += ['', "Status:"]
        for host_name in sorted(nodes_available.keys()):
            lines.append("  " + host_name)
            for state_name, state_number in sorted(status[host_name].items()):
                lines.append("    {}: {}".format(state_name, state_number))
            lines.append("")

        return '\n'.join(lines)

    def submit(self,
               tasks,
               container=None,
               tags=tuple(),
               working_dir=None,
               num_workers=None,
               force_num_tasks=None):
        """
        """
        nodes_available = self.available(squash=False)
        total_nodes_available = sum(nodes_available.values())
        print('{} nodes available'.format(total_nodes_available))
        for host_name, host_nodes_available in nodes_available.items():
            print('{:>20}: {} nodes available'.format(host_name,
                                                      host_nodes_available))

        if not total_nodes_available:
            return

        print('Pulling container {} on login node of hosts {}'.format(
            container, list(self.hosts.keys())))
        result = self.connections.run('sregistry pull {}'.format(container),
                                      hide=True,
                                      warn=True)
        print("\nCommand output")
        for host_name, out in zip(sorted(self.hosts.keys()), result.values()):
            print(host_name)
            print(out.stdout)
            print(out.stderr)

        for i, host_name in enumerate(sorted(self.hosts.keys())):
            if not nodes_available[host_name]:
                continue

            # filter tasks if they have host attached which != host_name

            # self.submit_single_host(filtered_tasks)
            self.submit_single_host(host_name, self.connections[i], tasks,
                                    nodes_available[host_name], tags,
                                    container, working_dir, num_workers,
                                    force_num_tasks)

        # TODO: make submission separately for each host because they have different number of
        # available nodes. Also, add a duplication ratio, so that tasks are submitted to multiple
        # hosts and the faster wins the race.

    def submit_single_host(self, host_name, connection, tasks, nodes_available,
                           tags, container, working_dir, num_workers,
                           force_num_tasks):

        if force_num_tasks:
            n_tasks = force_num_tasks
        else:
            n_tasks = len(tasks)

        array_option = 'array=1-{};'.format(min(n_tasks, nodes_available))
        flow_options = FLOW_OPTIONS_TEMPLATE.format(array=array_option,
                                                    job_name=".".join(
                                                        sorted(tags)))

        resources = []
        for name, value in tasks[0]['facility']['resources'].items():
            if name == 'cpu':
                resources.append('cpus-per-task={}'.format(value))
            elif name == 'gpu':
                resources.append('gres=gpu:{}'.format(value))
            elif name == 'mem':
                resources.append('mem={}'.format(value))
            elif name not in IGNORE_RESOURCES:
                raise ValueError('Unknown option: {}'.format(name))

        flow_options += ";" + ";".join(resources)

        submission_root = self.hosts[host_name].get('submission_root',
                                                    self.submission_root)
        if submission_root is None:
            raise ValueError(
                "submission_root is not defined for host {} nor globally.".
                format(host_name))
        submission_dir = os.path.join(submission_root, container)
        # TODO: Run mkdirs -p with connection.run instead of python's `os`.
        #       this folder should be created in _ensure_remote_setup.
        if not os.path.isdir(submission_dir):
            connection.run('mkdir -p {}'.format(submission_dir))

        prolog = self.hosts[host_name].get('prolog', self.prolog)

        flow_command = FLOW_TEMPLATE.format(container=container,
                                            root_dir=submission_dir,
                                            prolog=prolog,
                                            options=flow_options)

        options = {}
        if working_dir:
            options['working-dir'] = working_dir
        if num_workers:
            options['num-workers'] = num_workers

        if options:
            options = ' ' + ' '.join('--{}={}'.format(k, v)
                                     for k, v in options.items())
        else:
            options = ''

        command = COMMAND_TEMPLATE.format(
            container=" --container " + container if container else "",
            tags=" --tags " + " ".join(tags) if tags else "",
            options=options)

        submit_command = SUBMIT_COMMANDLINE_TEMPLATE.format(flow=flow_command,
                                                            command=command)

        print("Executing on {}:".format(host_name))
        print(submit_command)
        out = connection.run(submit_command, hide=True, warn=True)
        print("\nCommand output")
        print("------")
        print(out.stdout)
        print(out.stderr)
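
available() and info() duplicate the loop that tallies job states from
squeue's one-column output; it could be factored into a small helper. An
illustrative sketch (the helper name is invented):

def count_squeue_states(squeue_stdout):
    """Tally job states from `squeue -r -o %t` output, skipping the ST header."""
    states = {}
    for line in squeue_stdout.split('\n')[1:]:  # first line is the `ST` header
        line = line.strip()
        if line:
            states[line] = states.get(line, 0) + 1
    return states

# count_squeue_states('ST\nR\nR\nPD\nCG\n') == {'R': 2, 'PD': 1, 'CG': 1}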