def test_create_docker_task():
    dp = DockerProvider()
    docker = dp.docker
    env_vars = {'hello': 'team'}

    taskdef = TaskDefinition(
        name=TEST_TASK,
        image=TEST_IMAGE,
        parent='parent',
        env=env_vars,
        inputs={
            'hello': '123',
            'child': False,
        },
        # disables any output.
        # this is hacky and should be refactored
        # we need a proper way to disable all logging
        upstream='disabled',
    )

    # run task
    task = dp.spawn(taskdef)
    assert task.id == taskdef.id
    assert hasattr(task, 'container')
    assert hasattr(task.container, 'id')

    # try to grab the container from docker api
    container = docker.containers.get(task.container.id)
    assert task.container == container

    # make sure container is properly labeled
    assert container.labels == {
        LABEL_TASK_ID: task.id,
        LABEL_PARENT_ID: 'parent',
    }

    # wait for container to execute
    result = container.wait()
    assert result['StatusCode'] == 0

    # test task will dump info as json, so we can pick it up
    # and make sure it matches what we put in.
    logs = container.logs()
    task_dump = json.loads(logs)

    # taskdef
    assert taskdef.serialize() == task_dump['taskdef']

    # actual environment variables
    for key, val in env_vars.items():
        assert task_dump['env'][key] == val
def test_create_docker_task():
    dp = DockerProvider()
    docker = dp.docker
    env_vars = {'hello': 'team'}

    taskdef = TaskDefinition(
        name=TEST_TASK,
        image=TEST_IMAGE,
        parent='parent',
        env=env_vars,
        inputs={
            'hello': '123',
            'child': False,
        },
    )

    # run task
    task = dp.spawn(taskdef)
    assert task.id == taskdef.id
    assert hasattr(task, 'container')
    assert hasattr(task.container, 'id')

    # try to grab the container from docker api
    container = docker.containers.get(task.container.id)
    assert task.container == container

    # make sure container is properly labeled
    assert container.labels[LABEL_TASK_ID] == task.id
    assert container.labels[LABEL_PARENT_ID] == 'parent'

    # test task will dump info as json, so we can pick it up
    # and make sure it matches what we put in.
    task_dump = None
    for msg in task.logs():
        print(msg)
        if msg['type'] == 'task/log':
            task_dump = json.loads(msg['data'])

    # wait for container to execute
    result = container.wait()
    assert result['StatusCode'] == 0

    # taskdef
    assert task_dump is not None
    assert taskdef.serialize() == task_dump['taskdef']

    # actual environment variables
    for key, val in env_vars.items():
        assert task_dump['env'][key] == val
async def run(
    self,
    name: str,
    image: str,
    env: dict = {},
    routes: dict = {},
    ports: dict = {},
    cpu: any = 0,
    memory: any = 0,
    **inputs,
):
    taskdef = TaskDefinition(
        name=name,
        image=image,
        parent=self.id,
        inputs=inputs,
        ports=ports,
        routes=routes,
        cpu=cpu,
        memory=memory,
        env={
            **self.env,
            **env,
        },
    )

    # run it
    task = self.cluster.spawn(taskdef)

    # wait for container to exit
    await self.watch(task)

    # clean up
    self.cluster.destroy(task)
def extract_pod_taskdef(pod) -> TaskDefinition:
    for container in pod.spec.containers:
        for env in container.env:
            if env.name == ENV_TASK_DEFINITION:
                taskdef = env_unpack(env.value)
                return TaskDefinition(**taskdef)
    raise Exception('Failed to extract pod task definition')
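# Hedged usage sketch for extract_pod_taskdef: recover task definitions from
# running pods, assuming a configured kubernetes.client.CoreV1Api instance.
# The namespace and label selector values here are assumptions chosen only
# for illustration, not confirmed cowait conventions.
core = client.CoreV1Api()
pods = core.list_namespaced_pod(namespace='default', label_selector='cowait/task')
taskdefs = [extract_pod_taskdef(pod) for pod in pods.items]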
def test_docker_child_task():
    dp = DockerProvider()
    task = dp.spawn(TaskDefinition(
        name=TEST_TASK,
        image=TEST_IMAGE,
        inputs={'child': True},
    ))

    # wait for the child to spawn
    child = None
    for i in range(0, 10):
        children = dp.find_child_containers(task.id)
        if len(children) > 0:
            child = children[0]
            break
        time.sleep(0.5)

    # make sure we got a child
    assert child is not None

    # test list tasks
    tasks = dp.list_all()
    task_ids = list(map(lambda t: t.id, tasks))
    assert task.id in task_ids
    assert child.labels[LABEL_TASK_ID] in task_ids

    # kill the whole family
    dp.destroy(task.id)

    children = dp.find_child_containers(task.id)
    assert len(children) == 0
async def init_node(self):
    cluster = env_get_cluster_provider()
    parent = env_get_task_definition()
    token = os.getenv(ENV_KERNEL_TOKEN)

    taskdef = TaskDefinition(
        name='kernel',
        image=parent.image,
        parent=parent.id,
        volumes=parent.volumes,
        env=parent.env,
        upstream=get_local_url(),
        meta={
            **parent.meta,
            'virtual': True,
        },
    )

    # set up notebook node
    self.node = NotebookNode(taskdef)
    await self.node.start(token)

    # instantiate kernel task
    self.task = KernelTask(
        node=self.node,
        cluster=cluster,
        taskdef=taskdef,
    )

    # write globals
    self.shell.push({
        'cowait': cowait,
        'kernel': self.task,
        'tasks': self.task.subtasks,
        'NotebookRunner': NotebookRunner,
    })
def create_env(self, taskdef: TaskDefinition) -> dict:
    """
    Create a container environment dict from a task definition.

    Arguments:
        taskdef (TaskDefinition): Task definition

    Returns:
        env (dict): Environment variable dict
    """
    env = {
        **taskdef.env,
        ENV_GZIP_ENABLED: '1',
        ENV_TASK_CLUSTER: env_pack(self.serialize()),
        ENV_TASK_DEFINITION: env_pack(taskdef.serialize()),
    }

    # check total length of environment data
    length = 0
    for key, value in env.items():
        length += len(str(key)) + len(str(value))

    if length > MAX_ENV_LENGTH:
        raise ProviderError(
            f'Task environment too long. Was {length}, max: {MAX_ENV_LENGTH}')

    return env
def env_get_task_definition():
    if ENV_TASK_DEFINITION not in os.environ:
        raise ValueError(
            f'Task definition must be passed in the '
            f'{ENV_TASK_DEFINITION} environment variable.')

    taskdef_json = json.loads(os.environ[ENV_TASK_DEFINITION])
    return TaskDefinition.deserialize(taskdef_json)
async def send_init(self, taskdef: TaskDefinition) -> None:
    """
    Send a task initialization message.

    Arguments:
        taskdef (TaskDefinition): New task definition
    """
    await self.msg(TASK_INIT, task=taskdef.serialize())
def agent(
    config: CowaitConfig,
    detach: bool = False,
    upstream: str = None,
) -> None:
    logger = RunLogger(quiet=False, raw=False)
    try:
        cluster = config.get_cluster()
        if cluster.type == 'api':
            raise CliError("Error: Can't deploy agent using an API cluster")

        token = uuid()
        if cluster.type == 'docker':
            token = ''

        cluster.destroy('agent')

        # create task definition
        taskdef = TaskDefinition(
            id='agent',
            name='cowait.tasks.agent',
            image=DEFAULT_BASE_IMAGE,
            upstream=upstream,
            routes={
                '/': 80,
            },
            meta={
                'http_token': token,
            },
        )

        # submit task to cluster
        task = cluster.spawn(taskdef)

        if detach:
            logger.header('detached')
            return

        def destroy(*args):
            logger.header('interrupt')
            cluster.destroy(task.id)
            sys.exit(0)

        with ExitTrap(destroy):
            # capture & print logs
            logs = cluster.logs(task)
            logger.header('task output')
            for log in logs:
                logger.handle(log)
            logger.header()

    except ProviderError as e:
        raise CliError(f'Provider error: {e}')

    except TaskCreationError as e:
        raise CliError(f'Task creation error: {e}')
def spawn_test_task(name, **taskdef: dict) -> Task:
    """ Spawns a task using the test cluster provider """
    provider = get_test_provider()
    return provider.spawn(TaskDefinition(**{
        'name': name,
        'image': DEFAULT_BASE_IMAGE,
        **taskdef,
    }))
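# Minimal usage sketch for spawn_test_task, under two assumptions: TEST_TASK is
# the same task module constant used by the docker provider tests above, and
# extra keyword arguments are forwarded straight into the TaskDefinition.
def test_spawn_test_task_defaults():
    # image defaults to DEFAULT_BASE_IMAGE; inputs are passed through
    task = spawn_test_task(TEST_TASK, inputs={'hello': '123'})
    assert task.id is not None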
def agent(
    config: CowaitConfig,
    detach: bool = False,
    upstream: str = None,
) -> None:
    try:
        context = CowaitContext.open()
        cluster_name = context.get('cluster', config.default_cluster)
        cluster = config.get_cluster(cluster_name)
        if cluster.type == 'api':
            raise CliError("Error: Can't deploy agent using an API cluster")

        cluster.destroy('agent')

        # create task definition
        taskdef = TaskDefinition(
            id='agent',
            name='cowait.tasks.agent',
            image=DEFAULT_BASE_IMAGE,
            upstream=upstream,
            routes={
                '/': 80,
            },
            meta={
                'http_token': uuid(),
            },
        )

        # submit task to cluster
        task = cluster.spawn(taskdef)

        if detach:
            printheader('detached')
            return

        def destroy(*args):
            print()
            printheader('interrupt')
            cluster.destroy(task.id)
            sys.exit(0)

        with ExitTrap(destroy):
            # capture & print logs
            logs = cluster.logs(task)
            printheader('task output')
            for log in logs:
                print(log, flush=True)
            printheader()

    except ProviderError as e:
        raise CliError(f'Provider error: {e}')

    except TaskCreationError as e:
        raise CliError(f'Task creation error: {e}')
def test_rpc():
    dp = DockerProvider()
    task = dp.spawn(TaskDefinition(
        name='cowait.test.tasks.rpc_parent',
        image=DEFAULT_BASE_IMAGE,
    ))

    # wait for execution
    result = task.container.wait()
    assert result['StatusCode'] == 0
def test_max_env_length():
    """ Inputs that are too large should raise a ProviderError """
    random_data = uuid(2 * MAX_ENV_LENGTH, lower=False)
    with pytest.raises(ProviderError):
        cp = ClusterProvider('test')
        cp.create_env(TaskDefinition(
            'test-task',
            image='imaginary-image',
            inputs={
                'ohshit': random_data,
            },
        ))
def test_docker_task_error():
    dp = DockerProvider()
    task = dp.spawn(TaskDefinition(
        name=TEST_TASK,
        image=TEST_IMAGE,
        inputs={'error': True},
    ))

    container = dp.docker.containers.get(task.container.id)
    assert task.container == container

    result = container.wait()
    assert result['StatusCode'] != 0
def create_env(self, taskdef: TaskDefinition) -> dict:
    """
    Create a container environment dict from a task definition.

    Arguments:
        taskdef (TaskDefinition): Task definition

    Returns:
        env (dict): Environment variable dict
    """
    return {
        **taskdef.env,
        ENV_TASK_CLUSTER: json.dumps(self.serialize()),
        ENV_TASK_DEFINITION: json.dumps(taskdef.serialize()),
    }
def test_docker_child_error():
    dp = DockerProvider()
    task = dp.spawn(TaskDefinition(
        name=TEST_TASK,
        image=TEST_IMAGE,
        inputs={'child_error': True},
    ))

    container = dp.docker.containers.get(task.container.id)
    assert task.container == container

    # child error should cause the parent to fail
    result = container.wait()
    assert result['StatusCode'] != 0
def test(
    config: CowaitConfig,
    push: bool,
):
    logger = TestLogger()
    try:
        context = CowaitContext.open()
        cluster = config.get_cluster()

        if push:
            run_push()
        else:
            run_build()

        # execute the test task within the current image
        task = cluster.spawn(TaskDefinition(
            name='cowait.test',
            image=context.image,
        ))

        def destroy(*args):
            logger.header('interrupt')
            cluster.destroy(task.id)
            sys.exit(1)

        with ExitTrap(destroy):
            # capture & print logs
            logs = cluster.logs(task)
            logger.header('task output')
            for msg in logs:
                logger.handle(msg)
            logger.header()

            # grab task result
            passing = cluster.wait(task)
            sys.exit(0 if passing else 1)

    except ProviderError as e:
        logger.print_exception(f'Provider Error: {e}')
        sys.exit(1)

    except TaskCreationError as e:
        logger.print_exception(f'Error creating task: {e}')
        sys.exit(1)
async def spawn(
    self,
    name: str,
    image: str,
    id: str = None,
    ports: dict = {},
    routes: dict = {},
    inputs: dict = {},
    meta: dict = {},
    env: dict = {},
    cpu: str = '0',
    memory: str = '0',
    owner: str = '',
    **kwargs: dict,
) -> dict:
    if not isinstance(name, str) and issubclass(name, Task):
        name = name.__module__

    # todo: throw error if any input is a coroutine

    task = self.cluster.spawn(TaskDefinition(
        id=id,
        name=name,
        image=image,
        upstream=get_local_url(),
        meta=meta,
        ports=ports,
        routes=routes,
        env=env,
        cpu=cpu,
        memory=memory,
        owner=owner,
        inputs={
            **inputs,
            **kwargs,
        },
    ))

    # authorize id
    self.node.server.auth.add_token(id)

    # register with subtask manager
    self.subtasks.watch(task)

    return task.serialize()
def base_environment(cluster, taskdef: TaskDefinition) -> dict:
    """
    Create a container environment dict from a task definition.

    Arguments:
        taskdef (TaskDefinition): Task definition

    Returns:
        env (dict): Environment variable dict
    """
    return {
        **taskdef.env,
        ENV_GZIP_ENABLED: '1',
        ENV_TASK_CLUSTER: env_pack(cluster.serialize()),
        ENV_TASK_DEFINITION: env_pack(taskdef.serialize()),
    }
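# A minimal round-trip sketch of the packing helpers used above. This assumes
# env_pack/env_unpack are inverse serialization helpers (inferred from their
# call sites here and in extract_pod_taskdef, not a confirmed API contract).
packed = env_pack({'name': 'example', 'image': 'cowait/task'})
assert env_unpack(packed) == {'name': 'example', 'image': 'cowait/task'}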
def test(
    config: CowaitConfig,
    push: bool,
):
    try:
        context = CowaitContext.open()
        cluster_name = context.get('cluster', config.default_cluster)
        cluster = config.get_cluster(cluster_name)

        if push:
            run_push()
        else:
            run_build()

        # execute the test task within the current image
        task = cluster.spawn(TaskDefinition(
            name='cowait.test',
            image=context.get_image_name(),
        ))

        def destroy(*args):
            print()
            printheader('interrupt')
            cluster.destroy(task.id)
            os._exit(1)

        with ExitTrap(destroy):
            # capture & print logs
            logs = cluster.logs(task)
            printheader('task output')
            for log in logs:
                print(log, flush=True)

    except TaskCreationError as e:
        printheader('error')
        print('Error creating task:', str(e))

    except ProviderError as e:
        printheader('error')
        print('Provider error:', str(e))

    finally:
        printheader()
def test_kill_docker_task():
    dp = DockerProvider()
    task = dp.spawn(TaskDefinition(
        name=TEST_TASK,
        image=TEST_IMAGE,
        inputs={'forever': True},
    ))

    # ensure container exists
    dp.docker.containers.get(task.container.id)

    # destroy it
    dp.destroy(task.id)

    # ensure it no longer exists
    with pytest.raises(docker.errors.NotFound):
        dp.docker.containers.get(task.container.id)
def test_kill_docker_task():
    dp = DockerProvider()
    task = dp.spawn(TaskDefinition(
        name=TEST_TASK,
        image=TEST_IMAGE,
        inputs={'forever': True},
    ))

    # ensure container exists
    dp.docker.containers.get(task.container.id)

    # destroy it
    dp.destroy(task.id)

    # ensure it no longer exists
    with pytest.raises(docker.errors.NotFound):
        try:
            dp.docker.containers.get(task.container.id)
        except requests.exceptions.ChunkedEncodingError:
            # workaround for docker for mac bug:
            # https://github.com/docker/docker-py/issues/2696
            raise docker.errors.NotFound('Not found')
def run(
    config: CowaitConfig,
    task: str,
    name: str = None,
    cluster_name: str = None,
    inputs: dict = {},
    env: dict = {},
    ports: dict = {},
    routes: dict = {},
    build: bool = False,
    upstream: str = None,
    detach: bool = False,
    cpu: str = '0',
    memory: str = '0',
):
    try:
        context = CowaitContext.open()
        cluster_name = context.get('cluster', config.default_cluster)
        cluster = config.get_cluster(cluster_name)

        # figure out image name
        image, task = parse_task_image_name(task, None)
        if image is None:
            if build:
                build_cmd()
            image = context.get_image_name()

        # default to agent as upstream
        agent = cluster.find_agent()

        # create task definition
        taskdef = TaskDefinition(
            id=name,
            name=task,
            image=image,
            inputs=inputs,
            env={
                **context.get('environment', {}),
                **env,
            },
            ports=ports,
            routes=routes,
            upstream=context.coalesce('upstream', upstream, agent),
            parent=None,  # root task
            owner=os.getlogin(),
            cpu=cpu,
            memory=memory,
        )

        # print execution info
        printheader('task')
        print(' task: ', taskdef.id)
        print(' cluster: ', cluster_name)
        if taskdef.upstream:
            print(' upstream: ', taskdef.upstream)
        print(' image: ', image)
        print(' inputs: ', inputs)
        print(' env: ', env)

        # submit task to cluster
        task = cluster.spawn(taskdef)

        if detach:
            printheader('detached')
            return

        def destroy(*args):
            print()
            printheader('interrupt')
            cluster.destroy(task.id)
            sys.exit(0)

        with ExitTrap(destroy):
            # capture & print logs
            logs = cluster.logs(task)
            printheader('task output')
            for log in logs:
                print(log, flush=True)

    except ProviderError as e:
        printheader('error')
        print('Provider error:', str(e))

    except TaskCreationError as e:
        printheader('error')
        print('Error creating task:', str(e))

    finally:
        printheader()
def extract_container_taskdef(container) -> TaskDefinition:
    for env in container.attrs['Config']['Env']:
        if ENV_TASK_DEFINITION == env[0:len(ENV_TASK_DEFINITION)]:
            data = env[len(ENV_TASK_DEFINITION) + 1:]
            return TaskDefinition(**env_unpack(data))
    raise Exception('Unable to unpack container task definition')
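# A minimal sketch of the 'KEY=VALUE' slicing above. Docker exposes container
# env vars as flat 'KEY=VALUE' strings, so the prefix check matches the key and
# the slice skips past the '=' to leave the packed payload. The constant value
# 'COWAIT_TASK_DEFINITION' is a hypothetical stand-in for ENV_TASK_DEFINITION,
# chosen only for illustration.
ENV_TASK_DEFINITION = 'COWAIT_TASK_DEFINITION'
env = ENV_TASK_DEFINITION + '=' + '<packed taskdef>'
assert env[0:len(ENV_TASK_DEFINITION)] == ENV_TASK_DEFINITION
assert env[len(ENV_TASK_DEFINITION) + 1:] == '<packed taskdef>'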
def spawn(self, taskdef: TaskDefinition) -> RemoteTask:
    try:
        task = self.rpc('spawn', **taskdef.serialize())
        return RemoteTask(TaskDefinition.deserialize(task), self)
    except RpcError as e:
        raise TaskCreationError(str(e))
def test(
    config: Config,
    cluster_name: str = None,
    mount: bool = True,
    cpu: str = None,
    cpu_limit: str = None,
    memory: str = None,
    memory_limit: str = None,
    marks: str = None,
    verbose: bool = None,
    capture: bool = None,
):
    logger = TestLogger()
    try:
        context = Context.open(config)
        cluster = context.get_cluster(cluster_name)

        volumes = {}
        if mount and cluster.type == 'docker':
            # when testing in docker, mount the local directory.
            # this avoids the problem of having to constantly rebuild in order to test
            print('** Mounting', context.root_path)
            volumes['/var/task'] = {
                'bind': {
                    'src': context.root_path,
                    'mode': 'rw',
                    'inherit': 'same-image',
                },
            }

        # execute the test task within the current image
        task = cluster.spawn(TaskDefinition(
            name='cowait.test',
            image=context.image,
            owner=getpass.getuser(),
            env={
                **context.environment,
                **context.dotenv,
            },
            volumes={
                **context.get('volumes', {}),
                **volumes,
            },
            inputs={
                'marks': marks,
                'verbose': verbose,
                'capture': capture,
            },
            cpu=context.override('cpu', cpu),
            cpu_limit=context.override('cpu_limit', cpu_limit),
            memory=context.override('memory', memory),
            memory_limit=context.override('memory_limit', memory_limit),
        ))

        def destroy(*args):
            logger.header('interrupt')
            cluster.destroy(task.id)
            sys.exit(1)

        with ExitTrap(destroy):
            # capture & print logs
            logs = cluster.logs(task.id)
            logger.header('task output')
            for msg in logs:
                logger.handle(msg)
            logger.header()

            # grab task result
            passing = cluster.wait(task)
            sys.exit(0 if passing else 1)

    except ProviderError as e:
        logger.print_exception(f'Provider Error: {e}')
        sys.exit(1)

    except TaskCreationError as e:
        logger.print_exception(f'Error creating task: {e}')
        sys.exit(1)
def run(
    config: CowaitConfig,
    task: str,
    name: str = None,
    inputs: dict = {},
    env: dict = {},
    ports: dict = {},
    routes: dict = {},
    build: bool = False,
    upstream: str = None,
    detach: bool = False,
    cpu: str = None,
    cpu_limit: str = None,
    memory: str = None,
    memory_limit: str = None,
    raw: bool = False,
    quiet: bool = False,
    affinity: str = None,
):
    logger = RunLogger(raw, quiet)
    try:
        context = CowaitContext.open()
        cluster = config.get_cluster()

        # figure out image name
        remote_image = True
        image, task = parse_task_image_name(task, None)
        if image is None:
            if build:
                build_cmd(quiet=quiet or raw)
            image = context.image
            remote_image = False

        volumes = context.get('volumes', {})
        if not isinstance(volumes, dict):
            raise TaskCreationError('Invalid volume configuration')

        if not remote_image:
            volumes['/var/task'] = {
                'bind': {
                    'src': context.root_path,
                    'mode': 'rw',
                },
            }

        if not affinity:
            affinity = {}
        elif affinity.lower() == 'spread':
            affinity = {
                "type": "spread",
                "label": {
                    "key": task + "-key",
                    "value": task + "-value"
                }
            }
        elif affinity.lower() == 'group':
            affinity = {
                "type": "group",
                "label": {
                    "key": task + "-key",
                    "value": task + "-value"
                }
            }

        # default to agent as upstream
        agent = cluster.find_agent()

        # create task definition
        taskdef = TaskDefinition(
            id=name,
            name=task,
            image=image,
            inputs=inputs,
            env={
                **context.environment,
                **env,
            },
            ports=ports,
            routes=routes,
            parent=None,  # root task
            upstream=context.coalesce('upstream', upstream, agent),
            owner=getpass.getuser(),
            volumes=volumes,
            cpu=context.override('cpu', cpu),
            cpu_limit=context.override('cpu_limit', cpu_limit),
            memory=context.override('memory', memory),
            memory_limit=context.override('memory_limit', memory_limit),
            storage=context.get('storage', {}),
            affinity=affinity,
        )

        # print execution info
        logger.print_info(taskdef, config.default_cluster)

        # submit task to cluster
        task = cluster.spawn(taskdef)

        if detach:
            logger.header('detached')
            return

        def destroy(*args):
            logger.header('interrupt')
            cluster.destroy(task.id)
            sys.exit(1)

        with ExitTrap(destroy):
            # capture & print logs
            logs = cluster.logs(task)
            logger.header('task output')
            for msg in logs:
                logger.handle(msg)
            logger.header()

    except ProviderError as e:
        print('Provider error:', str(e))
        logger.print_exception(f'Provider Error: {e}')

    except TaskCreationError as e:
        logger.print_exception(f'Error creating task: {e}')
def env_get_task_definition():
    taskdef = env_get(ENV_TASK_DEFINITION)
    return TaskDefinition.deserialize(taskdef)
def notebook(config, image: str = None, cluster_name: str = None) -> None:
    context = Context.open(config)
    if not context.notebook:
        print('Notebook functionality is not enabled.')
        print('To enable, set features.notebook to True in cowait.yml and rebuild.')
        sys.exit(1)

    if image is not None:
        print('Remote images are currently not supported')
        sys.exit(1)

    volumes = {
        '/var/task': {
            'bind': {
                'src': os.getcwd(),
                'mode': 'rw',
                'inherit': 'same-image',
            },
        }
    }

    cluster = context.get_cluster(cluster_name)

    # Docker
    if cluster.type == 'docker':
        return run_cmd(
            config=config,
            task='cowait.notebook',
            build=False,
            image=image,
            routes={
                '/': '8888',
            },
            cluster_name=cluster_name,
            volumes=volumes,
        )

    # check for clientfs
    clientfs_executable = './clientfs-' + platform.system().lower()
    if not os.path.exists(clientfs_executable):
        print('Kubernetes notebooks are not supported in this build of Cowait')
        sys.exit(1)

    # Kubernetes
    core = client.CoreV1Api()
    notebook_id = 'notebook-' + uuid(4)
    core.create_namespaced_persistent_volume_claim(
        namespace=cluster.namespace,
        body=client.V1PersistentVolumeClaim(
            metadata=client.V1ObjectMeta(
                name=notebook_id,
                namespace=cluster.namespace,
            ),
            spec=client.V1PersistentVolumeClaimSpec(
                storage_class_name='clientfs',
                access_modes=['ReadWriteMany'],
                resources=client.V1ResourceRequirements(
                    requests={
                        'storage': '1G',
                    },
                ),
            ),
        ),
    )

    def delete_pvc(task_id):
        print('destroy', task_id)
        if task_id != notebook_id:
            return
        print('* stopping clientfs')
        clientfs.terminate()
        print('* deleting volume')
        core.delete_namespaced_persistent_volume_claim(notebook_id, cluster.namespace)

    cluster.on('kill', delete_pvc)

    pvc_id = None
    while True:
        time.sleep(1)
        volume = core.read_namespaced_persistent_volume_claim(notebook_id, cluster.namespace)
        if volume.status.phase == 'Bound':
            pvc_id = 'pvc-' + volume.metadata.uid
            print('* created volume', notebook_id, '/', pvc_id)
            break

    volumes['/var/task'] = {
        'persistent_volume_claim': {
            'claim_name': notebook_id,
        },
    }

    # start clientfs
    clientfs_host = cluster.args.get('clientfs', {}).get('host')
    print(f'* connecting clientfs volume to {clientfs_host}...')
    clientfs = subprocess.Popen([
        clientfs_executable,
        f"--proxy={clientfs_host}",
        f"--volume={pvc_id}",
    ])

    logger = RunLogger()
    try:
        # default to agent as upstream
        agent = cluster.find_agent()

        # create task definition
        taskdef = TaskDefinition(
            id=notebook_id,
            name='cowait.notebook',
            image=context.image,
            env={
                **context.extend('environment', {}),
                **context.dotenv,
            },
            routes={
                '/': '8888',
            },
            parent=None,  # root task
            upstream=agent,
            owner=getpass.getuser(),
            volumes=context.extend('volumes', volumes),
        )

        # print execution info
        logger.print_info(taskdef, cluster)

        # submit task to cluster
        task = cluster.spawn(taskdef)

        detach = False
        if detach:
            logger.header('detached')
            return

        def destroy(*args):
            logger.header('interrupt')
            cluster.destroy(task.id)
            sys.exit(1)

        with ExitTrap(destroy):
            # capture & print logs
            logs = cluster.logs(task.id)
            logger.header('task output')
            for msg in logs:
                logger.handle(msg)

    except Exception:
        traceback.print_exc()
        sys.exit(1)