Example #1
0
    def delete_pytorch_job(self, name, namespace):
        """Remove a PyTorchJob custom resource from the cluster.

        :param name: Name of the PyTorchJob to delete.
        :param namespace: Kubernetes namespace the job lives in.
        :returns: object: The API response for the deleted PyTorchJob.

        """
        # A default-configured client is sufficient here; no kubeconfig
        # overrides are applied.
        return PyTorchJobClient().delete(name, namespace=namespace)
Example #2
0
    def delete_pytorch_job(self, name, namespace):
        """Remove a PyTorchJob custom resource from the cluster.

        The underlying client is constructed from this object's stored
        Kubernetes configuration (config file, context, client configuration
        and persist flag).

        :param name: Name of the PyTorchJob to delete.
        :param namespace: Kubernetes namespace the job lives in.
        :returns: object: The API response for the deleted PyTorchJob.

        """
        job_client = PyTorchJobClient(
            config_file=self.config_file,
            context=self.context,
            client_configuration=self.client_configuration,
            persist_config=self.persist_config)
        return job_client.delete(name, namespace=namespace)
Example #3
0
    def create_pytorch_job(self, namespace, pytorchjob):
        """Create the provided PyTorchJob in the specified namespace.

        The PyTorchJob version is defined in PYTORCH_JOB_VERSION in
        kubeflow.pytorch.constants. The corresponding PyTorchJob CRD version
        must be installed in the cluster before creating the PyTorchJob.

        :param namespace: The namespace to create the custom resource in.
        :param pytorchjob: The JSON schema of the Resource to create.
        :returns: object: Created PyTorchJob.
        :raises RuntimeError: If the Kubernetes API rejects the request,
            e.g. because the PyTorchJob CRD version is not installed.

        """
        pytorchjob_client = PyTorchJobClient()
        try:
            return pytorchjob_client.create(pytorchjob, namespace=namespace)
        except client.rest.ApiException as e:
            # Chain the original ApiException so the underlying API error
            # (status code, reason) is not lost from the traceback.
            raise RuntimeError(
                "Failed to create PyTorchJob. Perhaps the CRD PyTorchJob version "
                "{} is not installed (if you use a different version you can pass "
                "it as an ENV variable called `PYTORCH_JOB_VERSION`)?".format(
                    constants.PYTORCH_JOB_VERSION)) from e
Example #4
0
import os

from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubeflow.pytorchjob import constants
from kubeflow.pytorchjob import utils
from kubeflow.pytorchjob import V1ReplicaSpec
from kubeflow.pytorchjob import V1PyTorchJob
from kubeflow.pytorchjob import V1PyTorchJobSpec
from kubeflow.pytorchjob import PyTorchJobClient

# Module-level PyTorchJob client shared by the helpers below. The kubeconfig
# path is read from $KUBECONFIG, falling back to the default ~/.kube/config.
PYTORCH_CLIENT = PyTorchJobClient(
    config_file=os.getenv('KUBECONFIG', '~/.kube/config'))


def wait_for_pytorchjob_ready(name, namespace='default', timeout_seconds=600):
    for _ in range(round(timeout_seconds / 10)):
        time.sleep(10)
        pytorchjob = PYTORCH_CLIENT.get(name, namespace=namespace)

        last_condition = pytorchjob.get("status", {}).get("conditions", [])[-1]
        last_status = last_condition.get("type", "").lower()

        if last_status == "succeeded":
            return
        elif last_status == "failed":
            raise RuntimeError("The PyTorchJob is failed.")
        else:
Example #5
0
def launch_job(client: PyTorchJobClient, job: V1PyTorchJob):
    """
    Launch PyTorchJob on kubeflow pipeline.

    Creates the job, installs a SIGTERM handler that deletes it, waits until
    the job is created (or failed), waits until the master pod is scheduled,
    streams the master pod's log to the logger, waits for the job to finish,
    and finally deletes the job. Calls sys.exit(1) if the job or its master
    pod ends up in a failed state.

    :param client: PyTorchJob API client used for all cluster operations.
    :param job: The PyTorchJob resource to create.
    """

    ret = client.create(job)  # type: V1PyTorchJob
    LOGGER.info('Launch PyTorchJob %s', ret)
    job_name = ret['metadata']['name']
    namespace = ret['metadata']['namespace']

    # Ensure the job is cleaned up if this launcher process is terminated.
    LOGGER.debug('setup sigterm handler')
    delete_job_func = _get_delete_pytorch_job_func(client, job_name, namespace)
    signal.signal(signal.SIGTERM, delete_job_func)

    # Block until the job reaches either the 'Created' or 'Failed' condition,
    # logging the most recent condition on every status update.
    job = client.wait_for_condition(job_name, ['Created', 'Failed'], namespace=namespace, status_callback=lambda x: LOGGER.debug(
        'PyTorchJob Conditions\n %s', x.get("status", {}).get("conditions", ['None Condition'])[-1]))

    # NOTE(review): this checks conditions[0] while the callback above logs
    # conditions[-1]; if 'Failed' is appended after 'Created', the first
    # element would be 'Created' and this check would miss the failure --
    # confirm the intended index.
    if job.get("status", {}).get("conditions", [])[0]['type'] == 'Failed':
        LOGGER.error('Cancel PytorchJob: %s', job_name)
        LOGGER.error('Unexpected condition. Could you confirm below ?')
        LOGGER.error(job)

        sys.exit(1)

    LOGGER.info('PyTorchJob created: %s', job_name)
    for _pname in client.get_pod_names(job_name, namespace=namespace):
        LOGGER.info('Pod name: %s', _pname)

    master_pod_name = list(client.get_pod_names(job_name, namespace=namespace, master=True))[0]
    master_pod = client.core_api.read_namespaced_pod(master_pod_name, namespace, pretty='true')

    LOGGER.debug('master pod spec')
    LOGGER.debug(master_pod)

    # Watch pod events for the master pod until it is scheduled and running,
    # or reaches a terminal phase.
    labels = utils.get_labels(job_name, master=True)
    LOGGER.info('wait till pod running. target selector: %s', labels)
    w = watch.Watch()
    # assumes the watch yields at least one event before stopping; otherwise
    # last_pod_info stays None and the phase check below would raise -- TODO confirm
    last_pod_info = None
    for event in w.stream(client.core_api.list_namespaced_pod,
                          namespace,
                          label_selector=utils.to_selector(labels)):
        last_pod_info = event['object']  # type: V1Pod
        LOGGER.debug("Event: %s %s %s %s",
                     event['type'],
                     last_pod_info.metadata.name,
                     last_pod_info.status.phase,
                     last_pod_info.status.conditions[-1] if len(last_pod_info.status.conditions) > 0 else 'none')

        # Stop watching on a terminal phase, or once a Running pod reports a
        # condition (last condition type 'PodScheduled' here).
        if last_pod_info.status.phase in ['Succeeded', 'Failed', 'Unknown'] or (last_pod_info.status.phase == 'Running' and
                                                                                len(last_pod_info.status.conditions) > 0 and
                                                                                last_pod_info.status.conditions[-1].type == 'PodScheduled'):
            w.stop()

    if last_pod_info.status.phase in ['Failed', 'Unknown']:
        LOGGER.error('Cancel PytorchJob: %s', job_name)
        LOGGER.error('master pod status: %s', last_pod_info.status.phase)
        LOGGER.error('Could you confirm below ?')
        LOGGER.error(last_pod_info)

        sys.exit(1)

    # Follow the master pod's 'pytorch' container log line-by-line;
    # _preload_content=False gives a raw stream we can iterate lazily.
    LOGGER.info('start watch PyTorchJob Pods log')
    for line in client.core_api.read_namespaced_pod_log(master_pod_name,
                                                        namespace,
                                                        container='pytorch',
                                                        follow=True,
                                                        _preload_content=False).stream():
        LOGGER.info(line.decode('utf-8')[:-1])  # strip trailing newline

    # Log stream closing does not guarantee job completion; wait explicitly.
    client.wait_for_job(job_name, namespace=namespace)

    LOGGER.info('Delete PyTorchJob')
    delete_job_func()

    LOGGER.info('Launched job finished')
Example #6
0
    default_job_name = job_name_format.format(yyyyMMdd=today.strftime('%Y%m%d'),
                                              random_str=''.join(random.choices(string.ascii_lowercase + string.digits, k=8)))

    parser = argparse.ArgumentParser(
        description='PyTorchJob Launcher')
    parser.add_argument('--pod-spec', type=yamlOrJsonStr, required=True,
                        help='Specify V1PodSpec by yaml or json. This spec be passed as is api')

    parser.add_argument('--job_name', type=str, default=default_job_name,
                        help='Specify PyTorchJob name. default format is [{}]'.format(job_name_format))
    parser.add_argument('--worker-num', type=int, default=0, help='Specify worker count. if 0 set, execute on master only.')
    parser.add_argument('--namespace', type=str, default='kubeflow', help='Specify namespace executed')

    parser.add_argument('--log-level', default='DEBUG', choices=['DEBUG', 'INFO'])

    args = parser.parse_args()

    logging.getLogger().setLevel(getattr(logging, args.log_level))

    args = vars(args)
    del args['log_level']

    LOGGER.debug(args)

    try:
        job = _create_pytorchjob(**args)
        launch_job(PyTorchJobClient(), job)
    except Exception as e:
        LOGGER.exception('Unexpected Error')
        raise e