コード例 #1
0
def start_tensorboard(tensorboard):
    """Start ``tensorboard`` via a ``TensorboardSpawner`` and save the result.

    The spawner is configured from the tensorboard's project/job names and
    uuids together with ``settings.K8S_CONFIG`` and ``settings.K8S_NAMESPACE``.

    If the spawner call raises, a warning is logged, the tensorboard status
    is set to ``JobLifeCycle.FAILED`` with a message naming the kind of
    exception, and nothing is saved.  Otherwise the value returned by
    ``get_job_definition(results)`` is stored on ``tensorboard.definition``
    and the tensorboard is saved.

    Returns ``None`` on every path.
    """

    def _fail(exc, message):
        # Shared failure path: log the exception, then flag the job as failed.
        logger.warning(
            'Could not start tensorboard, please check your polyaxon spec %s',
            exc)
        tensorboard.set_status(JobLifeCycle.FAILED, message=message)

    spawner_options = dict(
        project_name=tensorboard.project.unique_name,
        project_uuid=tensorboard.project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    )
    spawner = TensorboardSpawner(**spawner_options)

    try:
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            resources=tensorboard.resources,
            node_selectors=tensorboard.node_selectors,
        )
    except ApiException as api_error:
        _fail(
            api_error,
            'Could not start tensorboard, encountered a Kubernetes ApiException.',
        )
    except Exception as unexpected:
        _fail(
            unexpected,
            'Could not start tensorboard encountered an {} exception.'.format(
                unexpected.__class__.__name__),
        )
    else:
        # Only reached when the spawner succeeded; errors raised here
        # propagate to the caller, exactly as code placed after the try would.
        tensorboard.definition = get_job_definition(results)
        tensorboard.save()
コード例 #2
0
def start_tensorboard(tensorboard):
    """Schedule ``tensorboard``, start it through a spawner and record the outcome.

    The status is first set to ``JobLifeCycle.SCHEDULED``.  A
    ``TensorboardSpawner`` is then asked to start the tensorboard; on success
    the value of ``get_job_definition(results)`` is stored on
    ``tensorboard.definition`` and the tensorboard is saved.

    Any ``Exception`` raised while starting or saving is logged with its
    traceback, and the status is set to ``JobLifeCycle.FAILED`` together with
    a message and the formatted traceback.  ``ApiException`` and
    ``VolumeNotFoundError`` get dedicated messages; everything else is
    reported by exception class name.

    Returns ``None`` on every path.
    """
    # Flag the job as scheduled before any spawner work happens.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner_options = dict(
        project_name=tensorboard.project.unique_name,
        project_uuid=tensorboard.project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    )
    spawner = TensorboardSpawner(**spawner_options)

    try:
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            outputs_path=tensorboard.outputs_path,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations,
        )
        tensorboard.definition = get_job_definition(results)
        tensorboard.save()
    except ApiException:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a Kubernetes ApiException.')
    except VolumeNotFoundError as volume_error:
        _logger.error(
            'Could not start the tensorboard, please check your volume definitions.',
            exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a volume definition problem. %s'
            % volume_error)
    except Exception as unexpected:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        failure_trace = traceback.format_exc()
        failure_message = (
            'Could not start tensorboard encountered an {} exception.'.format(
                unexpected.__class__.__name__))
    else:
        # Started and saved: nothing left to report.
        return

    # Reached only after one of the handlers above ran to completion.
    tensorboard.set_status(
        JobLifeCycle.FAILED,
        message=failure_message,
        traceback=failure_trace)
コード例 #3
0
def start_tensorboard(tensorboard):
    """Schedule ``tensorboard``, start it through a spawner and save its definition.

    The status is first set to ``JobLifeCycle.SCHEDULED``.  The node selectors
    are resolved with ``get_node_selector`` (falling back to
    ``settings.NODE_SELECTORS_EXPERIMENTS``) and a ``TensorboardSpawner`` is
    asked to start the tensorboard.

    If resolving the node selectors or starting the tensorboard raises, a
    warning is logged, the status is set to ``JobLifeCycle.FAILED`` with a
    message describing the failure, and nothing is saved.  Otherwise the value
    of ``get_job_definition(results)`` is stored on ``tensorboard.definition``
    and the tensorboard is saved.

    Returns ``None`` on every path.  The ``VolumeNotFoundError`` branch used
    to return ``False`` while all other paths (success included) returned
    ``None``; it now matches them.
    """
    # Flag the job as scheduled before any spawner work happens.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(project_name=tensorboard.project.unique_name,
                                 project_uuid=tensorboard.project.uuid.hex,
                                 job_name=tensorboard.unique_name,
                                 job_uuid=tensorboard.uuid.hex,
                                 k8s_config=settings.K8S_CONFIG,
                                 namespace=settings.K8S_NAMESPACE,
                                 in_cluster=True)

    try:
        node_selectors = get_node_selector(
            node_selector=tensorboard.node_selectors,
            default_node_selector=settings.NODE_SELECTORS_EXPERIMENTS)
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            outputs_path=tensorboard.outputs_path,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            node_selectors=node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start tensorboard, please check your polyaxon spec %s',
            e)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message=
            'Could not start tensorboard, encountered a Kubernetes ApiException.'
        )
        return
    except VolumeNotFoundError as e:
        _logger.warning(
            'Could not start the tensorboard, '
            'please check your volume definitions %s', e)
        tensorboard.set_status(JobLifeCycle.FAILED,
                               message='Could not start the tensorboard, '
                               'encountered a volume definition problem. %s' %
                               e)
        # Bare return keeps this failure path consistent with the others.
        return
    except Exception as e:
        _logger.warning(
            'Could not start tensorboard, please check your polyaxon spec %s',
            e)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message='Could not start tensorboard encountered an {} exception.'.
            format(e.__class__.__name__))
        return
    # Only reached when the spawner succeeded; errors raised while building or
    # saving the definition propagate to the caller.
    tensorboard.definition = get_job_definition(results)
    tensorboard.save()
コード例 #4
0
def start_tensorboard(tensorboard):
    """Schedule ``tensorboard``, start it through a spawner and record the outcome.

    The status is first set to ``JobLifeCycle.SCHEDULED``.  A
    ``TensorboardSpawner`` (configured from ``conf`` and the tensorboard's
    build image) is then asked to start the tensorboard, using the two values
    unpacked from ``tensorboard.outputs_path``.  On success the value of
    ``get_job_definition(results)`` is stored on ``tensorboard.definition``
    and only that field is saved.

    Any ``Exception`` raised while starting or saving is logged with its
    traceback, and the status is set to ``JobLifeCycle.FAILED`` with a
    message.  ``ApiException``, ``VolumeNotFoundError`` and
    ``TensorboardValidation`` get dedicated messages; everything else is
    reported by exception class name.  The formatted traceback is attached to
    the status for every case except ``TensorboardValidation``, which passes
    ``None``.

    Returns ``None`` on every path.
    """
    # Flag the job as scheduled before any spawner work happens.
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner_options = dict(
        project_name=tensorboard.project.unique_name,
        project_uuid=tensorboard.project.uuid.hex,
        job_name=tensorboard.unique_name,
        job_uuid=tensorboard.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=tensorboard.build_image,
        in_cluster=True,
    )
    spawner = TensorboardSpawner(**spawner_options)

    # Holds (formatted traceback or None, status message) once a handler ran.
    failure = None
    outputs_specs, tensorboard_paths = tensorboard.outputs_path
    try:
        results = spawner.start_tensorboard(
            outputs_path=tensorboard_paths,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_specs=outputs_specs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,  # TODO: resources
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations,
        )
        tensorboard.definition = get_job_definition(results)
        tensorboard.save(update_fields=['definition'])
    except ApiException:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the job, encountered a Kubernetes ApiException.',
        )
    except VolumeNotFoundError as volume_error:
        _logger.error(
            'Could not start the tensorboard, please check your volume definitions.',
            exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the job, encountered a volume definition problem. %s'
            % volume_error,
        )
    except TensorboardValidation as validation_error:
        _logger.error(
            'Could not start the tensorboard, '
            'some experiments require authenticating to stores with different access.',
            exc_info=True)
        # No traceback is attached to the status for validation failures.
        failure = (
            None,
            'Could not start the tensorboard, '
            'some experiments require authenticating '
            'to stores with different access. %s' % validation_error,
        )
    except Exception as unexpected:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start tensorboard encountered an {} exception.'.format(
                unexpected.__class__.__name__),
        )

    if failure is not None:
        failure_trace, failure_message = failure
        tensorboard.set_status(JobLifeCycle.FAILED,
                               message=failure_message,
                               traceback=failure_trace)