def test_able_to_sync_jupyter():
    """A pushed Jupyter deployment can be pulled back as a second, working
        deployment that tunnels through a different local port."""
    user = USER_47
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes()
        stack.enter_context(cancel_on_exit(nodes))
        node = nodes[0]
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        local_port = get_free_local_port()
        jupyter = node.deploy_notebook(local_port=local_port)
        stack.enter_context(cancel_on_exit(jupyter))

        # Nothing was pushed yet, so a pull must come back empty.
        deployments = cluster.pull_deployments()
        assert not deployments.jupyter_deployments

        cluster.push_deployment(deployment=jupyter)
        deployments = cluster.pull_deployments()
        print(deployments)
        assert len(deployments.jupyter_deployments) == 1
        jupyter_2 = deployments.jupyter_deployments[0]
        try:
            # The pulled deployment opens its own tunnel, so the local
            # port differs, while both must serve HTTP.
            assert jupyter.local_port != jupyter_2.local_port
            check_local_http_connection(port=jupyter.local_port)
            check_local_http_connection(port=jupyter_2.local_port)
        finally:
            # Only the local tunnel of the pulled copy is closed here; the
            # remote notebook is cancelled by the cancel_on_exit above.
            jupyter_2.cancel_local()
def test_cancelled_dask_allocation_is_discarded_on_pull():
    """After a pushed Dask deployment is cancelled, a subsequent pull must
        discard it instead of reporting it as live."""
    user = USER_56
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes()
        stack.enter_context(cancel_on_exit(nodes))
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        dask = deploy_dask(nodes)
        # NOTE: no cancel_on_exit(dask) here — the try/finally below owns
        # cancellation (mirrors the jupyter variant of this test); keeping
        # both would cancel the same deployment twice on exit.
        try:
            # Fixed copy-paste bug: this is a Dask test, so the Dask
            # deployment list must be checked (was jupyter_deployments).
            deployments = cluster.pull_deployments()
            assert not deployments.dask_deployments

            cluster.push_deployment(deployment=dask)

            # Cancel, then clear the local reference so the finally block
            # does not cancel a second time.
            dask.cancel()
            dask = None

            # The pushed deployment is now dead, so the pull discards it.
            deployments = cluster.pull_deployments()
            assert not deployments.dask_deployments
        finally:
            if dask is not None:
                dask.cancel()
def test_able_to_sync_dask():
    """A pushed Dask deployment can be pulled back as a second, working
        deployment with its own diagnostics tunnels."""
    user = USER_55
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes()
        stack.enter_context(cancel_on_exit(nodes))
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        dask = deploy_dask(nodes)
        stack.enter_context(cancel_on_exit(dask))

        # Nothing was pushed yet, so a pull must come back empty.
        deployments = cluster.pull_deployments()
        assert not deployments.dask_deployments

        cluster.push_deployment(deployment=dask)
        deployments = cluster.pull_deployments()
        print(deployments)
        assert len(deployments.dask_deployments) == 1
        dask_2 = deployments.dask_deployments[0]
        try:
            # The pulled copy opens its own tunnels, so the diagnostics
            # addresses differ, while both sets must serve HTTP.
            assert dask.diagnostics.addresses != dask_2.diagnostics.addresses
            for url in dask.diagnostics.addresses:
                check_http_connection(url=url)
            for url in dask_2.diagnostics.addresses:
                check_http_connection(url=url)
        finally:
            # Close only the pulled copy's tunnels; the remote deployment
            # is cancelled by the cancel_on_exit above.
            dask_2.cancel_local()
def test_cancelled_jupyter_allocation_is_discarded_on_pull():
    """After a pushed Jupyter deployment is cancelled, a subsequent pull
        must discard it instead of reporting it as live."""
    user = USER_48
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes()
        stack.enter_context(cancel_on_exit(nodes))
        node = nodes[0]
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        local_port = get_free_local_port()
        jupyter = node.deploy_notebook(local_port=local_port)
        try:
            # Nothing was pushed yet, so a pull must come back empty.
            deployments = cluster.pull_deployments()
            assert not deployments.jupyter_deployments

            cluster.push_deployment(deployment=jupyter)

            # Cancel, then clear the local reference so the finally block
            # does not cancel a second time.
            jupyter.cancel()
            jupyter = None

            # The pushed deployment is now dead, so the pull discards it.
            deployments = cluster.pull_deployments()
            assert not deployments.jupyter_deployments
        finally:
            if jupyter is not None:
                jupyter.cancel()
def test_nodes_sync_does_not_work_when_waiting_twice():
    """Port info was already deleted, so waiting for the second time
        defaults to port 22."""
    user = USER_44
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes()
        with cancel_on_exit(nodes):
            # Push before waiting, so the pulled copy can wait too.
            cluster.push_deployment(deployment=nodes)
            nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert nodes.running()
            node = nodes[0]
            # First wait consumed the sshd port info file, so the original
            # handle knows the real (non-22) port.
            assert node.port != 22

            deployments = cluster.pull_deployments()
            assert len(deployments.nodes) == 1
            nodes_2 = deployments.nodes[0]
            nodes_2.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert nodes_2.running()
            node_2 = nodes_2[0]
            # The port info is gone, so the second wait falls back to 22.
            assert node_2.port == 22
            assert node_2.host == node.host
def test_dask_deployment_with_redeploy_failure():
    """If every worker fails validation (even after redeploys), the Dask
        deployment as a whole must fail with RuntimeError."""
    user = USER_42
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=2,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))

        # Sanity check that the attribute to be patched exists.
        assert idact.detail.dask.deploy_dask_impl.validate_worker
        stored_validate_worker = \
            idact.detail.dask.deploy_dask_impl.validate_worker

        # Always fails, so no worker ever validates.
        # pylint: disable=unused-argument
        def fake_validate_worker(worker: DaskWorkerDeployment):
            print("Fake worker validation.")
            raise ValueError("Fake worker validation fail.")

        try:
            # Monkey-patch worker validation; restored in finally.
            idact.detail.dask.deploy_dask_impl.validate_worker = \
                fake_validate_worker
            with pytest.raises(RuntimeError):
                with deploy_dask_on_testing_cluster(nodes):
                    pass
        finally:
            idact.detail.dask.deploy_dask_impl.validate_worker = \
                stored_validate_worker
def cancel(self):
    """Cancels the remote Jupyter deployment, then closes the local
        tunnel.

        The remote cancellation is registered first so it runs even if
        closing the local side raises."""
    log = get_logger(__name__)
    with stage_info(log, "Cancelling Jupyter deployment."):
        with cancel_on_exit(self._deployment):
            self.cancel_local()
def discard_invalid_workers(workers: List[DaskWorkerDeployment],
                            stack: ExitStack) \
        -> Tuple[List[DaskWorkerDeployment],
                 List[Node]]:
    """Validates each worker.

        Returns a tuple of workers that passed validation, and the nodes
        whose workers failed it.

        :param workers: Workers to validate.

        :param stack: Exit stack. Failed workers will be cancelled on exit.

    """
    log = get_logger(__name__)
    total = len(workers)
    healthy_workers = []
    failed_nodes = []
    for ordinal, worker in enumerate(workers, start=1):
        try:
            with stage_info(log, "Validating worker %d/%d.",
                            ordinal, total):
                validate_worker(worker=worker)
                healthy_workers.append(worker)
        except Exception:  # noqa, pylint: disable=broad-except
            # Keep going: record the node for redeployment and schedule
            # the broken worker for cancellation when the stack unwinds.
            log.debug("Failed to validate worker. Exception:", exc_info=1)
            failed_nodes.append(worker.deployment.node)
            stack.enter_context(cancel_on_exit(worker))
    return healthy_workers, failed_nodes
def test_remove_runtime_dir_test():
    """Exercises runtime dir removal: empty dirs, missing dirs and plain
        files are removed; dotfiles and nested dirs are preserved."""
    user = USER_15
    with ExitStack() as stack:
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))
        node = nodes[0]
        try:
            nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert nodes.running()
            check_will_remove_empty(node=node)
            check_will_ignore_non_existent(node=node)
            check_will_remove_files(node=node)
            check_will_not_remove_dotfiles(node=node)
            check_will_not_remove_nested_dirs(node=node)
        finally:
            # Clean up any leftovers the checks created in the work dir.
            node.run("rm -rf *")
def deploy_jupyter(nodes: Nodes):
    """Context-manager-style generator: deploys a Jupyter notebook on the
        first node, verifies it is up, yields the node, then (after the
        caller's block) verifies the notebook process is gone.

        :param nodes: Allocated nodes; the notebook runs on nodes[0].
    """
    ps_jupyter = "ps -u $USER | grep jupyter ; exit 0"
    node = nodes[0]
    nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
    assert nodes.running()

    local_port = get_free_local_port()
    deployment = node.deploy_notebook(local_port=local_port)
    with cancel_on_exit(deployment):
        print(deployment)
        assert str(deployment) == repr(deployment)
        assert deployment.local_port == local_port
        # Exactly one jupyter process should be running for this user.
        ps_jupyter_lines = node.run(ps_jupyter).splitlines()
        pprint(ps_jupyter_lines)
        assert len(ps_jupyter_lines) == 1
        check_local_http_connection(port=local_port)
        yield node
    # After cancellation, the notebook process should disappear; retry,
    # because teardown is asynchronous and CI may be slow.
    retry(lambda: check_no_output(node=node, command=ps_jupyter),
          retries=5 * get_testing_process_count(),
          seconds_between_retries=1)
def check_remote_key_and_node_access(stack: ExitStack, user: str):
    """Checks that the local public key is authorized for compute nodes
        (via authorized_keys.idact) after allocation, and that compute
        nodes are reachable without a password.

        :param stack: Exit stack; allocated nodes are cancelled on exit.

        :param user: Test user name.
    """
    public_key_value = get_public_key_value()
    cluster = show_cluster(name=TEST_CLUSTER)
    node = cluster.get_access_node()
    # NOTE(review): the exact extent of this set_password block could not
    # be recovered from the collapsed source; everything below is assumed
    # to run with the password installed — confirm against VCS history.
    with set_password(get_test_user_password(user)):
        assert node.run('whoami') == user
        assert node.run('whoami') == user
        # The public key is already in the access node's authorized_keys...
        node.run("grep '{public_key_value}' ~/.ssh/authorized_keys".format(
            public_key_value=public_key_value))
        # ...but not yet in the idact-managed file for compute nodes.
        with pytest.raises(RuntimeError):
            node.run(
                "grep '{public_key_value}'"
                " ~/.ssh/authorized_keys.idact".format(
                    public_key_value=public_key_value))

        nodes = cluster.allocate_nodes(nodes=2,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))
        print(nodes)
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
        # Waiting installs the key for compute nodes.
        node.run("grep '{public_key_value}'"
                 " ~/.ssh/authorized_keys.idact".format(
                     public_key_value=public_key_value))

        # Access to node without password works.
        assert nodes[0].run('whoami') == user
        check_direct_access_from_access_node_does_not_work(nodes[0])
def test_node_tunnel_fall_back_when_local_port_taken():
    """Checks that a tunnel will fall back to a random port if local
        port is taken."""
    user = USER_53
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))
        node = nodes[0]
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        there = get_free_remote_port(node=node)
        here = get_free_local_port()
        # First tunnel gets the requested local port.
        tunnel_1 = node.tunnel(there=there, here=here)
        stack.enter_context(close_tunnel_on_exit(tunnel_1))
        assert tunnel_1.here == here
        # Second tunnel asks for the now-occupied port and must fall back
        # to a different one instead of failing.
        tunnel_2 = node.tunnel(there=there, here=here)
        stack.enter_context(close_tunnel_on_exit(tunnel_2))
        assert tunnel_2.here != here
def cancel(self):
    """Cancels the scheduler deployment."""
    log = get_logger(__name__)
    with stage_info(log, "Cancelling scheduler deployment on %s.",
                    self._deployment.node.host):
        # Cancel the remote deployment even if closing the local side
        # raises: cancel_on_exit runs on unwind either way.
        with cancel_on_exit(self._deployment):
            self.cancel_local()
def deploy_dask_on_testing_cluster(nodes: Nodes):
    """Context-manager-style generator: deploys Dask on the given nodes,
        verifies scheduler/worker processes and diagnostics, yields the
        first node, then (after the caller's block) verifies all Dask
        processes are gone.

        :param nodes: Allocated nodes to deploy Dask on.
    """
    ps_dask_worker = "ps -u $USER | grep [d]ask-worker ; exit 0"
    ps_dask_scheduler = "ps -u $USER | grep [d]ask-scheduler ; exit 0"
    node = nodes[0]
    nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
    assert nodes.running()

    # No workers should be running before deployment.
    ps_lines = node.run(ps_dask_worker).splitlines()
    pprint(ps_lines)
    assert not ps_lines

    deployment = deploy_dask(nodes=nodes)
    with cancel_on_exit(deployment):
        print(deployment)
        assert str(deployment) == repr(deployment)

        # Exactly one scheduler process.
        ps_lines = node.run(ps_dask_scheduler).splitlines()
        pprint(ps_lines)
        assert len(ps_lines) == 1

        ps_lines = node.run(ps_dask_worker).splitlines()
        pprint(ps_lines)
        # some workers may have been redeployed
        assert len(ps_lines) >= len(nodes)

        client = deployment.get_client()
        print(client)
        check_submission_works(node=node, client=client)

        # One diagnostics address per worker plus one for the scheduler.
        pprint(deployment.diagnostics.addresses)
        assert len(deployment.diagnostics.addresses) == len(nodes) + 1
        for address in deployment.diagnostics.addresses:
            request = requests.get(address)
            assert "text/html" in request.headers['Content-type']

        # open_all should open exactly the diagnostics addresses.
        opened_addresses = []
        with save_opened_in(opened_addresses):
            deployment.diagnostics.open_all()
        assert opened_addresses == deployment.diagnostics.addresses
        yield node
    # After cancellation, all Dask processes should disappear; retry,
    # because teardown is asynchronous and CI may be slow.
    retry(lambda: check_no_output(node=node, command=ps_dask_scheduler),
          retries=5 * get_testing_process_count(),
          seconds_between_retries=1)
    retry(lambda: check_no_output(node=node, command=ps_dask_worker),
          retries=5 * get_testing_process_count(),
          seconds_between_retries=1)
def test_node_tunnel_stress():
    """Allocates a node and delegates to the tunnel stress test helper."""
    user = USER_40
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100))
        stack.enter_context(cancel_on_exit(nodes))
        run_tunnel_stress_test(stack=stack, user=user, nodes=nodes)
def test_generic_deployment():
    """Deploys a generic script on a node, checks its process is alive
        while deployed and gone after cancellation."""
    user = USER_7
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        print(cluster)
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))
        node = nodes[0]
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
        assert nodes.running()

        # deploy_generic requires the internal node type.
        assert isinstance(node, NodeInternal)
        runtime_dir = create_runtime_dir(node=node)
        stack.enter_context(
            remove_runtime_dir_on_failure(node=node,
                                          runtime_dir=runtime_dir))

        script_contents = "echo ABC && sleep 30"
        assert isinstance(node, NodeInternal)
        deployment = deploy_generic(node=node,
                                    script_contents=script_contents,
                                    runtime_dir=runtime_dir)
        with cancel_on_exit(deployment):
            print(deployment)
            # kill -0 succeeds only if the process is alive.
            node.run("kill -0 {pid}".format(pid=deployment.pid))
        # After cancellation the process is gone, so kill -0 fails.
        with pytest.raises(RuntimeError):
            node.run("kill -0 {pid}".format(pid=deployment.pid))
def test_node_tunnel_fall_back_when_local_port_free_but_fails():
    """Checks that a tunnel will fall back to a random port if local port
        is initially free, but tunnel cannot be created anyway
        (e.g. another process binds to it at the last moment)."""
    user = USER_54
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))
        node = nodes[0]
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        there = get_free_remote_port(node=node)
        here = get_free_local_port()
        real_build_tunnel = idact.detail.nodes.node_impl.build_tunnel
        # NOTE(review): this socket is created but never bound or
        # connected — the last-moment bind from the docstring is simulated
        # by the fake below instead; confirm the socket is intentional.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # Mutable cell so the closure can count calls.
        tries = [0]

        def fake_build_tunnel(*args, **kwargs) -> TunnelInternal:
            # Fail on the first attempt to force the fallback path; the
            # second attempt delegates to the real implementation.
            tries[0] += 1
            if tries[0] == 1:
                raise RuntimeError("Fake failure.")
            if tries[0] != 2:
                assert False
            return real_build_tunnel(*args, **kwargs)

        try:
            # Monkey-patch tunnel creation; restored in finally.
            idact.detail.nodes.node_impl.build_tunnel = fake_build_tunnel
            tunnel = node.tunnel(there=there, here=here)
            stack.enter_context(close_tunnel_on_exit(tunnel))
            assert tries[0] == 2
            # The retried tunnel must use a different local port.
            assert tunnel.here != here
        finally:
            idact.detail.nodes.node_impl.build_tunnel = real_build_tunnel
            sock.close()
def test_jupyter_deployment():
    """Smoke test: a Jupyter notebook can be deployed on one node."""
    user = USER_6
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))
        # All deployment checks live in the deploy_jupyter helper.
        with deploy_jupyter(nodes):
            pass
def test_able_to_reach_nodes_when_using_password_based_authentication():
    """It should be possible to connect to compute nodes even when using
        password-based authentication, because local public key is
        authorized for the compute nodes after initial connection.
        However, direct connection from access node should fail.
        Password is still used between the client and the access node."""
    user = USER_10
    with ExitStack() as stack:
        stack.enter_context(set_up_key_location(user))
        # ASK auth: password-based authentication to the access node.
        stack.enter_context(reset_environment(user=user,
                                              auth=AuthMethod.ASK))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(disable_pytest_stdin())

        cluster = show_cluster(TEST_CLUSTER)
        node = cluster.get_access_node()
        nodes = cluster.allocate_nodes(nodes=2,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))
        print(nodes)
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
        compute_node = nodes[0]
        assert isinstance(compute_node, NodeInternal)
        public_key_value = get_public_key_value()

        # Local key was installed for the deployed sshd, allowing access
        # between the access node and compute nodes.
        assert nodes[0].run('whoami') == user

        # Local key was not installed for the access node
        with pytest.raises(RuntimeError):
            node.run("grep '{public_key_value}'"
                     " ~/.ssh/authorized_keys".format(
                         public_key_value=public_key_value))

        # But it was installed for compute nodes.
        node.run("grep '{public_key_value}'"
                 " ~/.ssh/authorized_keys.idact".format(
                     public_key_value=public_key_value))

        check_direct_access_from_access_node_does_not_work(nodes[0])
def test_dask_deployment_with_absolute_scratch_path():
    """Dask deployment should work when the cluster scratch directory is
        configured as an absolute path."""
    user = USER_24
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        # Absolute path instead of the default (e.g. an env-var based one).
        cluster.config.scratch = '/home/user-24'
        nodes = cluster.allocate_nodes(nodes=1,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=10))
        stack.enter_context(cancel_on_exit(nodes))
        with deploy_dask_on_testing_cluster(nodes):
            pass
def test_migrate_deployments():
    """Migrating from an old version of the deployments file should work
        without fatal errors."""
    user = USER_57
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        access_node = cluster.get_access_node()
        assert isinstance(access_node, NodeInternal)

        # Raises RuntimeError (cat fails) when the file does not exist.
        def check_deployments_file_exists():
            access_node.run("cat ~/.idact/.deployments")

        nodes = cluster.allocate_nodes()
        stack.enter_context(cancel_on_exit(nodes))
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)

        # No deployments file yet.
        with pytest.raises(RuntimeError):
            check_deployments_file_exists()

        # Plant a minimal old-format deployments file: only the type tag,
        # none of the per-category entries a current file would have.
        remote_path = access_node.run("echo ~/.idact/.deployments")
        put_file_on_node(node=access_node,
                         remote_path=remote_path,
                         contents='{{"type": "{type}"}}'.format(
                             type=SerializableTypes.DEPLOYMENT_DEFINITIONS))

        # Pulling the old file must not crash, just report nothing.
        deployments = cluster.pull_deployments()
        assert not deployments.nodes

        cluster.push_deployment(deployment=nodes)
        deployments = cluster.pull_deployments()
        assert len(deployments.nodes) == 1

        # Clearing removes the file entirely.
        cluster.clear_pushed_deployments()
        with pytest.raises(RuntimeError):
            check_deployments_file_exists()
def run_tunnel_test(user: str, nodes: Nodes):
    """Exercises app and SSH tunnels to a compute node, then checks that
        tunnelling fails after the allocation is cancelled.

        :param user: Test user name.

        :param nodes: Allocated nodes; cancelled when the inner stack
            exits.
    """
    node = nodes[0]
    nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
    assert nodes.running()
    with ExitStack() as stack:
        stack.enter_context(cancel_on_exit(nodes))
        there = get_free_remote_port(node=nodes[0])
        here = get_free_local_port()
        # Dummy HTTP server on the remote port, joined on exit.
        server = start_dummy_server_thread(user=user, server_port=there)
        stack.enter_context(join_on_exit(server))

        tunnel = node.tunnel(there=there, here=here)
        stack.enter_context(close_tunnel_on_exit(tunnel))
        print(tunnel)
        assert str(tunnel) == repr(tunnel)
        assert tunnel.here == here
        assert tunnel.there == there

        def access_dummy_server():
            return requests.get(
                "http://127.0.0.1:{local_port}".format(local_port=here))

        # The server thread may need a moment to start accepting.
        request = retry(access_dummy_server,
                        retries=5 * get_testing_process_count(),
                        seconds_between_retries=2)
        assert "text/html" in request.headers['Content-type']

        # SSH tunnel: its string form is a usable ssh command line.
        ssh_tunnel = node.tunnel_ssh()
        stack.enter_context(close_tunnel_on_exit(ssh_tunnel))
        assert str(ssh_tunnel) == repr(ssh_tunnel)
        assert str(ssh_tunnel).startswith("ssh ")
        assert user in str(ssh_tunnel)
        assert str(ssh_tunnel.here) in str(ssh_tunnel)
        assert ssh_tunnel.there == node.port
    # The stack cancelled the allocation; everything must now fail.
    assert not nodes.running()
    with pytest.raises(RuntimeError):
        nodes.wait()
    with pytest.raises(RuntimeError):
        node.tunnel(there=there, here=here)
def test_basic():
    """End-to-end smoke test: list clusters, allocate two nodes, run a
        command on each, then verify everything fails after cancellation."""
    user = USER_1
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        clusters = show_clusters()
        print(clusters)
        assert len(clusters) == 1

        cluster = show_cluster(name=TEST_CLUSTER)
        print(cluster)
        assert clusters[TEST_CLUSTER] == cluster

        nodes = cluster.allocate_nodes(nodes=2,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30),
                                       native_args={
                                           '--partition': 'debug'})
        with cancel_on_exit(nodes):
            assert len(nodes) == 2
            assert nodes[0] in nodes
            print(nodes)
            assert str(nodes) == repr(nodes)
            nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert nodes.running()
            print(nodes)
            print(nodes[0])
            assert nodes[0].run('whoami') == user
            assert nodes[1].run('whoami') == user
        # After cancellation, the allocation is unusable.
        assert not nodes.running()
        with pytest.raises(RuntimeError):
            nodes.wait()
        with pytest.raises(RuntimeError):
            nodes[0].run('whoami')
def test_dask_deployment_with_redeploy_on_validation_failure():
    """If a worker fails validation once, it should be redeployed and the
        overall Dask deployment should still succeed."""
    user = USER_41
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=2,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))

        # Sanity check that the attribute to be patched exists.
        assert idact.detail.dask.deploy_dask_impl.validate_worker
        stored_validate_worker = \
            idact.detail.dask.deploy_dask_impl.validate_worker
        # Mutable cell so the closure can count validation calls.
        fake_validation_counter = [0]

        # Fails only the very first validation, so exactly one worker
        # gets redeployed.
        # pylint: disable=unused-argument
        def fake_validate_worker(worker: DaskWorkerDeployment):
            current_count = fake_validation_counter[0]
            fake_validation_counter[0] = current_count + 1
            print("Fake worker validation.")
            if current_count == 0:
                raise RuntimeError(
                    "Fake worker validation: First node fail.")
            print("Deciding the worker is valid.")

        try:
            # Monkey-patch worker validation; restored in finally.
            idact.detail.dask.deploy_dask_impl.validate_worker = \
                fake_validate_worker
            with deploy_dask_on_testing_cluster(nodes):
                pass
            # 2 initial validations (one failed) + 1 for the redeploy.
            assert fake_validation_counter[0] == 3
        finally:
            idact.detail.dask.deploy_dask_impl.validate_worker = \
                stored_validate_worker
def test_dask_deployment_with_setup_actions():
    """Configured Dask setup actions should run on the node before the
        deployment comes up."""
    user = USER_18
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        nodes = cluster.allocate_nodes(nodes=2,
                                       cores=1,
                                       memory_per_node=MiB(100),
                                       walltime=Walltime(minutes=30))
        stack.enter_context(cancel_on_exit(nodes))

        # Two actions, so both execution and ordering are observable
        # through the final file2.txt contents.
        cluster.config.setup_actions.dask = ['echo ABC > file.txt',
                                             'mv file.txt file2.txt']
        with deploy_dask_on_testing_cluster(nodes) as node:
            assert node.run("cat file2.txt") == "ABC"
def test_can_read_node_resources():
    """Resource limits are unknown for the access node but reported for
        allocated nodes; live usage stays in a believable range under
        CPU stress."""
    user = USER_39
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        access_node = cluster.get_access_node()
        print(str(access_node))
        assert str(access_node) == repr(access_node)
        # The access node has no allocation, so no known limits.
        assert access_node.resources.cpu_cores is None
        assert access_node.resources.memory_total is None

        start_stress_cpu(user=user, timeout=10)
        try:
            check_resources_in_believable_range(access_node.resources)
        finally:
            stop_stress_cpu(user=user)

        nodes = cluster.allocate_nodes(cores=1,
                                       memory_per_node=bitmath.GiB(0.8))
        assert len(nodes) == 1
        node = nodes[0]
        stack.enter_context(cancel_on_exit(nodes))
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
        assert nodes.running()
        # Allocated nodes report the requested limits.
        assert node.resources.cpu_cores == 1
        assert node.resources.memory_total == bitmath.GiB(0.8)

        start_stress_cpu(user=user, timeout=10)
        try:
            check_resources_in_believable_range(access_node.resources)
        finally:
            stop_stress_cpu(user=user)

        assert node.run('whoami') == user
def test_clear_deployments():
    """The remote deployments file is created on push, readable on pull,
        and removed by clear_pushed_deployments."""
    user = USER_46
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))
        stack.enter_context(clear_deployment_sync_data(user))

        cluster = show_cluster(name=TEST_CLUSTER)
        access_node = cluster.get_access_node()

        # Raises RuntimeError (cat fails) when the file does not exist.
        def check_deployments_file_exists():
            access_node.run("cat ~/.idact/.deployments")

        nodes = cluster.allocate_nodes()
        with cancel_on_exit(nodes):
            nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
            assert nodes.running()

            # No file before the first push.
            with pytest.raises(RuntimeError):
                check_deployments_file_exists()

            cluster.push_deployment(deployment=nodes)
            check_deployments_file_exists()

            deployments = cluster.pull_deployments()
            assert len(deployments.nodes) == 1
            assert deployments.nodes[0].running()
            # Pulling must not remove the file.
            check_deployments_file_exists()

            cluster.clear_pushed_deployments()
            with pytest.raises(RuntimeError):
                check_deployments_file_exists()

            deployments = cluster.pull_deployments()
            assert not deployments.nodes
def test_allocation_should_default_to_port_22_if_port_info_file_is_missing():
    """If the sshd port info file never appears, waiting should fall back
        to connecting on port 22."""
    user = USER_61
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        node = cluster.get_access_node()
        nodes = cluster.allocate_nodes(memory_per_node=MiB(100))
        stack.enter_context(cancel_on_exit(nodes))

        # Delete the port info file as soon as the allocation writes it;
        # retried because the file may not exist yet.
        retry(lambda: node.run("rm ~/.idact/sshd_ports/alloc-*/*"),
              retries=SLURM_WAIT_TIMEOUT,
              seconds_between_retries=1)

        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
        assert nodes.running()
        # Without port info, the node handle defaults to 22.
        assert nodes[0].port == 22
def test_allocate_defaults():
    """allocate_nodes with no arguments yields one node with 1 core and
        1 GiB of memory."""
    user = USER_22
    with ExitStack() as stack:
        stack.enter_context(disable_pytest_stdin())
        stack.enter_context(set_up_key_location(user))
        stack.enter_context(reset_environment(user))
        stack.enter_context(set_password(get_test_user_password(user)))

        cluster = show_cluster(name=TEST_CLUSTER)
        # No arguments: rely entirely on the defaults.
        nodes = cluster.allocate_nodes()
        stack.enter_context(cancel_on_exit(nodes))
        assert len(nodes) == 1
        node = nodes[0]
        nodes.wait(timeout=SLURM_WAIT_TIMEOUT)
        assert nodes.running()
        assert node.resources.cpu_cores == 1
        assert node.resources.memory_total == bitmath.GiB(1)
        print(node)
        assert node.run('whoami') == user
def cancel(self):
    """Cancels the scheduler and all worker deployments.

        Registration order (scheduler first, then workers) means the
        ExitStack cancels the workers first on unwind, the scheduler
        last, and keeps going even if one cancellation raises."""
    with ExitStack() as pending:
        for deployment in [self._scheduler] + list(self._workers):
            pending.enter_context(cancel_on_exit(deployment))