def delete_pytorch_job(self, name, namespace):
    """Delete a PyTorchJob.

    A fresh default-configured client is used for the call.

    :param name: Name of the PyTorchJob to delete.
    :param namespace: Namespace the PyTorchJob lives in.
    :returns: object: The deleted PyTorchJob.
    """
    return PyTorchJobClient().delete(name, namespace=namespace)
def delete_pytorch_job(self, name, namespace):
    """Delete a PyTorchJob in the given namespace.

    The client is built from this object's stored Kubernetes settings
    (config file, context, client configuration and persist flag).

    :param name: Name of the PyTorchJob to delete.
    :param namespace: Namespace containing the PyTorchJob.
    :returns: object: The deleted PyTorchJob.
    """
    job_client = PyTorchJobClient(
        config_file=self.config_file,
        context=self.context,
        client_configuration=self.client_configuration,
        persist_config=self.persist_config)
    return job_client.delete(name, namespace=namespace)
def create_pytorch_job(self, namespace, pytorchjob):
    """Create the provided PyTorchJob in the specified namespace.

    The PyTorchJob version is defined by PYTORCH_JOB_VERSION in
    kubeflow.pytorch.constants; that CRD version must be installed in
    the cluster before a PyTorchJob can be created.

    :param namespace: Namespace to create the PyTorchJob in.
    :param pytorchjob: The JSON schema of the resource to create.
    :returns: object: Created PyTorchJob.
    :raises RuntimeError: If the Kubernetes API rejects the request,
        typically because the expected CRD version is not installed.
    """
    pytorchjob_client = PyTorchJobClient()
    try:
        return pytorchjob_client.create(pytorchjob, namespace=namespace)
    except client.rest.ApiException as e:
        # Chain the original ApiException so the API error detail
        # (status code, reason) is preserved in the traceback.
        raise RuntimeError(
            "Failed to create PyTorchJob. Perhaps the CRD PyTorchJob version "
            "{} is not installed? (If you use a different version you can "
            "pass it as the ENV variable `PYTORCH_JOB_VERSION`.)".format(
                constants.PYTORCH_JOB_VERSION)) from e
import os
import time

from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubeflow.pytorchjob import constants
from kubeflow.pytorchjob import utils
from kubeflow.pytorchjob import V1ReplicaSpec
from kubeflow.pytorchjob import V1PyTorchJob
from kubeflow.pytorchjob import V1PyTorchJobSpec
from kubeflow.pytorchjob import PyTorchJobClient

# Shared client for the helpers below; honours $KUBECONFIG when set.
PYTORCH_CLIENT = PyTorchJobClient(
    config_file=os.getenv('KUBECONFIG', '~/.kube/config'))


def wait_for_pytorchjob_ready(name, namespace='default', timeout_seconds=600):
    """Poll a PyTorchJob every 10s until it succeeds, fails, or times out.

    Returns normally once the job's latest condition is "Succeeded".
    Falls through (returning None) if the timeout budget is exhausted
    without reaching a terminal condition.

    :param name: Name of the PyTorchJob to watch.
    :param namespace: Namespace the job runs in.
    :param timeout_seconds: Overall polling budget in seconds.
    :raises RuntimeError: If the job reaches the "Failed" condition.
    """
    for _ in range(round(timeout_seconds / 10)):
        time.sleep(10)
        pytorchjob = PYTORCH_CLIENT.get(name, namespace=namespace)
        conditions = pytorchjob.get("status", {}).get("conditions", [])
        if not conditions:
            # Status not populated yet (job just created); keep polling
            # instead of crashing on an empty condition list.
            continue
        last_status = conditions[-1].get("type", "").lower()
        if last_status == "succeeded":
            return
        elif last_status == "failed":
            raise RuntimeError("The PyTorchJob is failed.")
        else:
            # Still Created/Running/Restarting — keep polling.
            continue
def launch_job(client: PyTorchJobClient, job: V1PyTorchJob):
    """Launch a PyTorchJob on a Kubeflow pipeline and block until it finishes.

    Creates the job, installs a SIGTERM handler that deletes it, waits for
    the master pod to be scheduled, streams the master pod's log, then waits
    for job completion and deletes the job.

    :param client: PyTorchJobClient used for all API calls.
    :param job: The V1PyTorchJob resource to create.

    NOTE(review): exits the whole process via sys.exit(1) on failure paths
    rather than raising — callers cannot catch these as exceptions.
    """
    ret = client.create(job)  # type: V1PyTorchJob
    LOGGER.info('Launch PyTorchJob %s', ret)
    job_name = ret['metadata']['name']
    namespace = ret['metadata']['namespace']

    # Ensure the job is cleaned up if this launcher process is terminated.
    LOGGER.debug('setup sigterm handler')
    delete_job_func = _get_delete_pytorch_job_func(client, job_name, namespace)
    signal.signal(signal.SIGTERM, delete_job_func)

    # Block until the job reaches either the Created or Failed condition,
    # logging the latest condition on each status update.
    job = client.wait_for_condition(
        job_name, ['Created', 'Failed'], namespace=namespace,
        status_callback=lambda x: LOGGER.debug(
            'PyTorchJob Conditions\n %s',
            x.get("status", {}).get("conditions", ['None Condition'])[-1]))
    # NOTE(review): this inspects conditions[0] (the first recorded
    # condition), not the latest one — confirm that is intentional.
    if job.get("status", {}).get("conditions", [])[0]['type'] == 'Failed':
        LOGGER.error('Cancel PytorchJob: %s', job_name)
        LOGGER.error('Unexpected condition. Could you confirm below ?')
        LOGGER.error(job)
        sys.exit(1)
    LOGGER.info('PyTorchJob created: %s', job_name)

    for _pname in client.get_pod_names(job_name, namespace=namespace):
        LOGGER.info('Pod name: %s', _pname)
    # There is exactly one master pod; take the first (only) name.
    master_pod_name = list(client.get_pod_names(
        job_name, namespace=namespace, master=True))[0]
    master_pod = client.core_api.read_namespaced_pod(
        master_pod_name, namespace, pretty='true')
    LOGGER.debug('master pod spec')
    LOGGER.debug(master_pod)

    # Watch the master pod (selected by its labels) until it is scheduled
    # and running, or reaches a terminal/unknown phase.
    labels = utils.get_labels(job_name, master=True)
    LOGGER.info('wait till pod running. \ntarget selector: %s', labels)
    w = watch.Watch()
    last_pod_info = None
    for event in w.stream(client.core_api.list_namespaced_pod, namespace,
                          label_selector=utils.to_selector(labels)):
        last_pod_info = event['object']  # type: V1Pod
        LOGGER.debug("Event: %s %s %s %s", event['type'],
                     last_pod_info.metadata.name,
                     last_pod_info.status.phase,
                     last_pod_info.status.conditions[-1]
                     if len(last_pod_info.status.conditions) > 0 else 'none')
        # Stop watching once the pod is terminal, unknown, or is Running
        # with its latest condition being PodScheduled.
        if last_pod_info.status.phase in ['Succeeded', 'Failed', 'Unknown'] \
                or (last_pod_info.status.phase == 'Running'
                    and len(last_pod_info.status.conditions) > 0
                    and last_pod_info.status.conditions[-1].type
                    == 'PodScheduled'):
            w.stop()
    if last_pod_info.status.phase in ['Failed', 'Unknown']:
        LOGGER.error('Cancel PytorchJob: %s', job_name)
        LOGGER.error('master pod status: %s', last_pod_info.status.phase)
        LOGGER.error('Could you confirm below ?')
        LOGGER.error(last_pod_info)
        sys.exit(1)

    # Stream the master pod's 'pytorch' container log line by line until
    # the container exits; [:-1] strips the trailing newline.
    LOGGER.info('start watch PyTorchJob Pods log')
    for line in client.core_api.read_namespaced_pod_log(
            master_pod_name, namespace, container='pytorch', follow=True,
            _preload_content=False).stream():
        LOGGER.info(line.decode('utf-8')[:-1])

    client.wait_for_job(job_name, namespace=namespace)
    LOGGER.info('Delete PyTorchJob')
    delete_job_func()
    LOGGER.info('Launched job finished')
# Build a default job name from today's date plus an 8-char random suffix.
default_job_name = job_name_format.format(
    yyyyMMdd=today.strftime('%Y%m%d'),
    random_str=''.join(
        random.choices(string.ascii_lowercase + string.digits, k=8)))

parser = argparse.ArgumentParser(description='PyTorchJob Launcher')
parser.add_argument('--pod-spec', type=yamlOrJsonStr, required=True,
                    help='Specify V1PodSpec by yaml or json. '
                         'This spec is passed to the API as is.')
parser.add_argument('--job_name', type=str, default=default_job_name,
                    help='Specify PyTorchJob name. default format is '
                         '[{}]'.format(job_name_format))
parser.add_argument('--worker-num', type=int, default=0,
                    help='Specify worker count. '
                         'if 0 set, execute on master only.')
parser.add_argument('--namespace', type=str, default='kubeflow',
                    help='Specify namespace executed')
parser.add_argument('--log-level', default='DEBUG',
                    choices=['DEBUG', 'INFO'])
args = parser.parse_args()

logging.getLogger().setLevel(getattr(logging, args.log_level))
# Pass the remaining CLI options straight through as keyword arguments;
# log-level is launcher-only so it is dropped first.
args = vars(args)
del args['log_level']
LOGGER.debug(args)
try:
    job = _create_pytorchjob(**args)
    launch_job(PyTorchJobClient(), job)
except Exception:
    LOGGER.exception('Unexpected Error')
    # Bare re-raise preserves the original traceback without adding
    # an extra frame (unlike `raise e`).
    raise