def cleanup(
    work: str,
    store: Store,
    sched: Optional[TaskManager] = None,
    purge_db: bool = False,
    purge_fs: bool = False,
    data_dir: Optional[str] = None,
) -> List[Cleaned]:
    """Clean up a job.

    :param work: The name of the job.
    :param store: The job store.
    :param sched: The scheduler used to kill jobs.
    :param purge_db: Should the pipeline also be removed from the job db?
    :param purge_fs: Should the pipeline also be removed from the file system?
    :param data_dir: A directory that, combined with `work`, is where all
        artifacts produced by the pipeline live.
    :returns: The list of jobs and whether each was removed from k8s, the job
        db, and the file system.
    """
    if not work:
        return []
    sched = sched if sched else KubernetesTaskManager(store)
    parent_details = store.get(work)
    children = list(chain(parent_details[Store.EXECUTED], parent_details[Store.EXECUTING]))
    cleaned = set()
    purged = set()
    removed = set()
    if purge_fs:
        if data_dir is None:
            LOGGER.warning("Requested removal from the file system but no data directory provided.")
        else:
            shutil.rmtree(os.path.join(data_dir, work), ignore_errors=True)
            removed = set(chain([work], children))
    for job in children:
        try:
            sched.kill(job)
            cleaned.add(job)
        except Exception:  # pylint: disable=broad-except
            # Killing is best-effort: the job may already be gone.
            pass
        if purge_db:
            if store.remove(job):
                purged.add(job)
    # Remove the work entry from the db last so that if there is an error
    # beforehand we can still use the db entry.
    if purge_db:
        if store.remove(work):
            purged.add(work)
    return [
        Cleaned(j, done(j, cleaned), done(j, purged), done(j, removed))
        for j in chain([work], children)
    ]
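# A minimal usage sketch, assuming a concrete `Store` implementation is
# available; the job name and data directory below are hypothetical
# placeholders, and `_example_cleanup` is not part of this module.
def _example_cleanup(store: Store) -> None:
    """Illustrative only: kill a pipeline's jobs and purge every record of it."""
    for c in cleanup(
        'sst2-pipeline-1234',        # hypothetical job name
        store,
        purge_db=True,               # also drop the job-db entries
        purge_fs=True,               # and the artifacts on disk
        data_dir='/data/pipelines',  # hypothetical artifact root
    ):
        LOGGER.info("%s", c)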
def submit(self, task: Task) -> str:
    """Submit a multi-worker PyTorchJob Task.

    :param task: The task definition
    :type task: Task
    :return: A string handle name
    :rtype: str
    """
    secrets = self._reference_secrets(task)
    configmaps = self._generate_configmaps(task)
    # Each elastic worker gets a single GPU; parallelism comes from the
    # number of workers, not GPUs per worker.
    task.num_gpus = 1
    pod_spec = task_to_pod_spec(
        task,
        container_name="pytorch-elasticjob",
        secrets=secrets,
        configmaps=configmaps,
    )
    template_metadata = client.V1ObjectMeta(name=task.name)
    template = client.V1PodTemplateSpec(metadata=template_metadata, spec=pod_spec)

    worker_replica_spec = {
        'replicas': task.num_workers,
        'restartPolicy': PyTorchElasticJobHandler.EXIT_CODE,
        'template': template,
    }
    spec = {
        'replicaSpecs': {'Worker': worker_replica_spec},
        'minReplicas': task.num_workers,
        'maxReplicas': task.num_workers,
    }
    etcd_svc = getenv('PYTORCH_ELASTIC_ETCD_SVC')
    if not etcd_svc:
        LOGGER.warning(
            "No environment variable set for etcd service, looking for first available in elastic-job namespace"
        )
        # Fall back to the first `etcd-service` Service in the elastic-job
        # namespace; this assumes one exists.
        api = client.CoreV1Api()
        etcd_svc = [
            x for x in api.list_namespaced_service('elastic-job').items
            if x.metadata.name == 'etcd-service'
        ][0].spec.cluster_ip
    LOGGER.info("Using etcd service on %s:%d", etcd_svc, PyTorchElasticJobHandler.ETCD_PORT)
    spec['rdzvEndpoint'] = f'{etcd_svc}:{PyTorchElasticJobHandler.ETCD_PORT}'

    pytorch_job_spec = {
        'kind': PyTorchElasticJobHandler.NAME,
        'apiVersion': f'{PyTorchElasticJobHandler.GROUP}/{PyTorchElasticJobHandler.VERSION}',
        'metadata': client.V1ObjectMeta(generate_name=task.name),
        'spec': spec,
    }
    pytorch_job = self.api.create_namespaced_custom_object(
        PyTorchElasticJobHandler.GROUP,
        PyTorchElasticJobHandler.VERSION,
        self.namespace,
        PyTorchElasticJobHandler.PLURAL,
        pytorch_job_spec,
    )
    return pytorch_job['metadata']['name']
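# A minimal usage sketch, assuming a constructed handler and `Task`; the etcd
# cluster IP below is a hypothetical placeholder. Pinning the environment
# variable skips the in-cluster service lookup fallback above (assumes `os`
# is imported in this module).
def _example_submit(handler: 'PyTorchElasticJobHandler', task: Task) -> str:
    """Illustrative only: pin the rendezvous endpoint, then submit."""
    os.environ['PYTORCH_ELASTIC_ETCD_SVC'] = '10.0.0.42'  # hypothetical cluster IP
    handle = handler.submit(task)
    LOGGER.info("Submitted PyTorchElasticJob %s", handle)
    return handle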
def main():
    """Use `asyncio` to connect to a websocket, request a pipeline, and wait."""
    signal.signal(signal.SIGINT, lambda *args, **kwargs: exit(0))

    parser = argparse.ArgumentParser(description='HTTP or Websocket-based Pipeline scheduler')
    parser.add_argument('work', help='Job')
    parser.add_argument('--host', default=ODIN_URL, type=str)
    parser.add_argument('--port', default=ODIN_PORT)
    parser.add_argument('--token', help="File where JWT token can reside", default=os.path.expanduser("~/.odin.token"))
    parser.add_argument('--username', '-u', help="Username", default=getuser())
    parser.add_argument('--password', '-p', help="Password")
    parser.add_argument(
        '--scheme',
        choices={'http', 'wss', 'ws', 'https'},
        default=ODIN_SCHEME,
        help='Connection protocol: use `http(s)` for REST, `wss` for remote websocket connections and `ws` for localhost',
    )
    args, overrides = parser.parse_known_args()
    context = parse_and_merge_overrides({}, overrides, pre='x')

    url = f'{args.scheme}://{args.host}:{args.port}'
    if args.scheme.startswith('ws'):
        if context:
            LOGGER.warning("Context is ignored by the web-socket tier")
        asyncio.get_event_loop().run_until_complete(schedule_pipeline(url, args.work))
    else:
        jwt_token = get_jwt_token(url, args.token, args.username, args.password)
        try:
            schedule_pipeline_http(url, jwt_token, args.work, context)
        except ValueError:
            # The cached token may be stale: delete the token file, fetch a
            # fresh one, and retry once.
            if os.path.exists(args.token):
                os.remove(args.token)
            jwt_token = get_jwt_token(url, args.token, args.username, args.password)
            schedule_pipeline_http(url, jwt_token, args.work, context)
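# A minimal invocation sketch (illustrative only; the pipeline name, host,
# and port are hypothetical placeholders, and `sys` is assumed to be
# imported). The same flags apply on a real command line; unrecognized
# `--x.*` flags become config overrides for the HTTP tier.
def _example_main() -> None:
    """Illustrative only: drive `main` as if from the command line."""
    sys.argv = ['odin-run', 'my-pipeline',  # 'odin-run' is a hypothetical entry-point name
                '--scheme', 'https', '--host', 'odin.example.com', '--port', '443']
    main()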
def expand_dirs(files: List[str]) -> List[str]:
    """Given a list of files and dirs, return a list of all files, with each dir expanded.

    :param files: The list of files and dirs.
    :returns: The list with dirs expanded into the files contained within them.
    """
    new_files = []
    for f in files:
        f = os.path.expanduser(f)
        if not os.path.exists(f):
            LOGGER.warning("Requested hash of %s but file not found.", f)
            continue
        if os.path.isdir(f):
            new_files.extend(expand_dir(f))
        else:
            new_files.append(f)
    return new_files
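# A minimal usage sketch; the paths below are hypothetical placeholders and
# `_example_expand_dirs` is not part of this module.
def _example_expand_dirs() -> List[str]:
    """Illustrative only: flatten a mix of files and directories."""
    # Missing paths are skipped with a warning, directories are expanded via
    # `expand_dir`, and plain files pass through unchanged.
    return expand_dirs(['~/configs/sst2.yml', '~/datasets'])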
def _reference_secrets(self, task: Task) -> Optional[List[Secret]]:
    """Generate secrets based on the requirements of the job.

    Eventually we can support custom secrets by having the job create secrets
    from the yaml config. Then this function will combine secrets on the job
    with these injected secrets to yield the final full list.

    :param task: The job we are running to add secrets to.
    :type task: Task
    :returns: A list of Secrets or `None`
    :rtype: Optional[List[Secret]]
    """
    secrets = task.secrets if task.secrets is not None else []
    command = listify(task.command)
    # Note: `odin-chores` commands also match the `odin` prefix, so chore
    # jobs can receive both the cred and ssh-key secrets.
    if command[0].startswith('odin'):
        try:
            # Check if the odin-cred secret exists
            _ = self.core_api.read_namespaced_secret(name=ODIN_CRED, namespace=self.namespace)
            cred_secret = Secret(os.path.join(SECRET_LOC, ODIN_CRED_FILE), ODIN_CRED, ODIN_CRED_FILE)
            # Make sure they aren't already requesting this secret
            if not any(s == cred_secret for s in secrets):
                secrets.append(cred_secret)
        except client.rest.ApiException:
            if '--cred' not in task.args:
                LOGGER.warning(
                    'No --cred arg found on job %s and no odin-cred secret found to populate container.',
                    task.name,
                )
    if command[0].startswith('odin-chores'):
        try:
            # Check if the ssh-key secret exists
            _ = self.core_api.read_namespaced_secret(name=SSH_KEY, namespace=self.namespace)
            # Make the key permissions -rw-------
            ssh_secret = Secret(os.path.join(SECRET_LOC, SSH_KEY_FILE), SSH_KEY, SSH_KEY_FILE, SSH_MODE)
            # Make sure they aren't already requesting this secret
            if not any(s == ssh_secret for s in secrets):
                secrets.append(ssh_secret)
        except client.rest.ApiException:
            pass
    return secrets if secrets else None
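# A minimal usage sketch, assuming a constructed handler; the `Task` passed
# in is a hypothetical one whose command starts with `odin`, so the odin-cred
# secret is injected when it exists in the namespace.
def _example_reference_secrets(handler, task: Task) -> None:
    """Illustrative only: inspect the secrets a job would mount."""
    secrets = handler._reference_secrets(task)  # pylint: disable=protected-access
    for secret in secrets or []:
        LOGGER.info("Mounting secret %s", secret)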