def build_jobs_stop(self,
                    project_name,
                    project_uuid,
                    build_job_name,
                    build_job_uuid,
                    update_status=True):
    """Stop the dockerizer of a build job and optionally mark the job as stopped.

    The stop request is forwarded to ``dockerizer_scheduler.stop_dockerizer``.
    When that call reports a falsy result and fewer than two retries have
    been made, the task asks to be retried and does nothing else.

    Args:
        self: Task instance exposing ``request.retries`` and ``retry()``
            (presumably a bound Celery task -- the decorator is not visible
            here, confirm).
        project_name: Forwarded to the dockerizer scheduler.
        project_uuid: Forwarded to the dockerizer scheduler.
        build_job_name: Forwarded to the scheduler and used in the retry log.
        build_job_uuid: Forwarded to the scheduler and used to look up the job.
        update_status: When falsy, the stored job status is left untouched.
    """
    stop_succeeded = dockerizer_scheduler.stop_dockerizer(
        project_name=project_name,
        project_uuid=project_uuid,
        build_job_name=build_job_name,
        build_job_uuid=build_job_uuid)

    if not stop_succeeded:
        # Only the first two failures trigger another attempt; after that
        # the task falls through to the status update below.
        if self.request.retries < 2:
            _logger.info('Trying again to delete build `%s`.', build_job_name)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
            return

    if update_status:
        job = get_valid_build_job(build_job_uuid=build_job_uuid)
        if job:
            # Reflect the stop in the persisted job status.
            job.set_status(status=JobLifeCycle.STOPPED,
                           message='BuildJob was stopped.')
        else:
            _logger.info(
                'Something went wrong, '
                'the BuildJob `%s` does not exist anymore.', build_job_uuid)
def build_jobs_schedule_deletion(build_job_id, immediate=False):
    """Archive a build job, stop it if it is stoppable, and optionally delete it.

    Args:
        build_job_id: Id used to look up the build job (jobs flagged as
            deleted are included in the lookup).
        immediate: When truthy, a ``DELETE_ARCHIVED_BUILD_JOB`` task is sent
            as well, right after the optional stop task.
    """
    job = get_valid_build_job(build_job_id=build_job_id, include_deleted=True)
    if not job:
        _logger.info(
            'Something went wrong, '
            'the BuildJob `%s` does not exist anymore.', build_job_id)
        return

    job.archive()

    if job.is_stoppable:
        owner = job.project
        stop_kwargs = {
            'project_name': owner.unique_name,
            'project_uuid': owner.uuid.hex,
            'build_job_name': job.unique_name,
            'build_job_uuid': job.uuid.hex,
            'update_status': True,
            'collect_logs': False,
            'message': 'Build is scheduled for deletion.'
        }
        celery_app.send_task(SchedulerCeleryTasks.BUILD_JOBS_STOP,
                             kwargs=stop_kwargs)

    if immediate:
        celery_app.send_task(SchedulerCeleryTasks.DELETE_ARCHIVED_BUILD_JOB,
                             kwargs={'job_id': build_job_id})
def build_jobs_stop(build_job_id, update_status=True):
    """Look up a build job by id and hand it to the dockerizer scheduler to stop.

    Args:
        build_job_id: Id used to look up the build job.
        update_status: Forwarded unchanged to
            ``dockerizer_scheduler.stop_dockerizer``.
    """
    job = get_valid_build_job(build_job_id=build_job_id)
    if job:
        dockerizer_scheduler.stop_dockerizer(job, update_status=update_status)
        return

    # Nothing to stop: the lookup returned no job.
    _logger.info('Something went wrong, '
                 'the BuildJob `%s` does not exist anymore.', build_job_id)
def build_jobs_start(build_job_id):
    """Look up a build job by id and hand it to the dockerizer scheduler to start.

    Args:
        build_job_id: Id used to look up the build job.
    """
    job = get_valid_build_job(build_job_id=build_job_id)
    if job:
        dockerizer_scheduler.start_dockerizer(job)
        return

    # Nothing to start: the lookup returned no job.
    _logger.info('Something went wrong, '
                 'the BuildJob `%s` does not exist anymore.', build_job_id)
def build_jobs_set_dockerfile(build_job_uuid, dockerfile):
    """Store ``dockerfile`` on the build job identified by ``build_job_uuid``.

    Args:
        build_job_uuid: UUID used to look up the build job.
        dockerfile: Value assigned to the job's ``dockerfile`` attribute
            before the job is saved.
    """
    job = get_valid_build_job(build_job_uuid=build_job_uuid)
    if job:
        job.dockerfile = dockerfile
        job.save()
        return

    # The lookup returned no job, so there is nothing to update.
    _logger.info('Something went wrong, '
                 'the BuildJob `%s` does not exist anymore.', build_job_uuid)
def build_jobs_check_heartbeat(build_job_id):
    """Mark a build job as failed when no heartbeat is recorded for it.

    Nothing happens when ``RedisHeartBeat.build_is_alive`` reports the build
    as alive, or when the job lookup returns nothing.

    Args:
        build_job_id: Id used for both the heartbeat check and the job lookup.
    """
    if not RedisHeartBeat.build_is_alive(build_id=build_job_id):
        job = get_valid_build_job(build_job_id=build_job_id)
        if job:
            # No heartbeat was reported: treat the build job as a zombie.
            job.set_status(
                JobLifeCycle.FAILED,
                message='BuildJob is in zombie state (no heartbeat was reported).')
def build_jobs_stop(self,
                    project_name,
                    project_uuid,
                    build_job_name,
                    build_job_uuid,
                    update_status=True,
                    collect_logs=True,
                    is_managed=True,
                    message=None):
    """Collect logs for, stop, and optionally flag as stopped a build job.

    For managed builds the logs are collected first (best effort, when
    requested) and the dockerizer is then asked to stop. Unmanaged builds
    skip both steps and are treated as already deleted.

    Args:
        self: Task instance exposing ``request.retries`` and ``retry()``
            (presumably a bound Celery task -- the decorator is not visible
            here, confirm).
        project_name: Forwarded to the dockerizer scheduler.
        project_uuid: Forwarded to the dockerizer scheduler.
        build_job_name: Forwarded to the scheduler and used in log messages.
        build_job_uuid: Used for log collection, the scheduler call, and
            the job lookup.
        update_status: When falsy, the stored job status is left untouched.
        collect_logs: Whether to attempt log collection (managed builds only).
        is_managed: Whether log collection and the dockerizer stop apply.
        message: Status message to store; a default text is used when falsy.
    """
    if is_managed:
        if collect_logs:
            try:
                logs_collect_build_job(build_uuid=build_job_uuid)
            except (OSError, VolumeNotFoundError, PolyaxonStoresException):
                # A log collection failure must not prevent the stop itself.
                _logger.warning(
                    'Scheduler could not collect the logs for build `%s`.',
                    build_job_name)
        stop_succeeded = dockerizer_scheduler.stop_dockerizer(
            project_name=project_name,
            project_uuid=project_uuid,
            build_job_name=build_job_name,
            build_job_uuid=build_job_uuid)
    else:
        stop_succeeded = True

    if not stop_succeeded:
        # Only the first two failures trigger another attempt; after that
        # the task falls through to the status update below.
        if self.request.retries < 2:
            _logger.info('Trying again to delete build `%s`.', build_job_name)
            self.retry(countdown=Intervals.EXPERIMENTS_SCHEDULER)
            return

    if update_status:
        job = get_valid_build_job(build_job_uuid=build_job_uuid,
                                  include_deleted=True)
        if job:
            # Reflect the stop in the persisted job status.
            job.set_status(status=JobLifeCycle.STOPPED,
                           message=message or 'BuildJob was stopped.')
        else:
            _logger.info(
                'Something went wrong, '
                'the BuildJob `%s` does not exist anymore.', build_job_uuid)
def build_jobs_stop(project_name,
                    project_uuid,
                    build_job_name,
                    build_job_uuid,
                    update_status=True):
    """Stop the dockerizer of a build job and optionally mark the job as stopped.

    The result of the scheduler call is ignored by this variant.

    Args:
        project_name: Forwarded to the dockerizer scheduler.
        project_uuid: Forwarded to the dockerizer scheduler.
        build_job_name: Forwarded to the dockerizer scheduler.
        build_job_uuid: Forwarded to the scheduler and used to look up the job.
        update_status: When falsy, the stored job status is left untouched.
    """
    dockerizer_scheduler.stop_dockerizer(
        project_name=project_name,
        project_uuid=project_uuid,
        build_job_name=build_job_name,
        build_job_uuid=build_job_uuid)

    if update_status:
        job = get_valid_build_job(build_job_uuid=build_job_uuid)
        if job:
            # Reflect the stop in the persisted job status.
            job.set_status(status=JobLifeCycle.STOPPED,
                           message='BuildJob was stopped.')
        else:
            _logger.info('Something went wrong, '
                         'the BuildJob `%s` does not exist anymore.',
                         build_job_uuid)
def build_jobs_notify_done(build_job_id):
    """Dispatch the notification that matches a build job's final state.

    Exactly one of the failed / stopped / succeeded notifiers is called, in
    that order of precedence. Nothing is called when none of the three flags
    is set.

    Args:
        build_job_id: Id used to look up the build job.
    """
    job = get_valid_build_job(build_job_id=build_job_id)
    if not job:
        _logger.info('Something went wrong, '
                     'the BuildJob `%s` does not exist anymore.', build_job_id)
        return

    # Dependents (jobs, notebooks, tensorboards, experiments) are informed
    # according to how the build ended.
    if job.failed:
        # Failed build: dependents are told that the build failed.
        notify_build_job_failed(job)
    elif job.stopped:
        # Stopped build: dependents get stopped too.
        notify_build_job_stopped(job)
    elif job.succeeded:
        # Successful build: dependents can start.
        notify_build_job_succeeded(job)