def determine_cached_tag(self, build_id, base_image_id): job_id = self._job_key(build_id) try: job_data = self._orchestrator.get_key(job_id) job_data_json = json.loads(job_data) build_job = BuildJob(AttrDict(job_data_json["job_queue_item"])) except KeyError: logger.warning("Job %s does not exist in orchestrator", job_id) return None except Exception as e: logger.warning("Exception loading job from orchestrator: %s", e) return None return build_job.determine_cached_tag(base_image_id)
def _register_realm(self, realm_spec): logger.debug("Got call to register realm %s with manager", realm_spec["realm"]) # Create the build information block for the registered realm. build_job = BuildJob(AttrDict(realm_spec["job_queue_item"])) execution_id = realm_spec.get("execution_id", None) executor_name = realm_spec.get("executor_name", "EC2Executor") logger.debug("Registering realm %s with manager: %s", realm_spec["realm"], realm_spec) component = self.register_component(realm_spec["realm"], BuildComponent, token=realm_spec["token"]) build_info = BuildInfo( component=component, build_job=build_job, execution_id=execution_id, executor_name=executor_name, ) self._component_to_job[component] = build_job self._build_uuid_to_info[build_job.build_uuid] = build_info logger.debug("Registered realm %s with manager", realm_spec["realm"]) return component
def _register_realm(self, realm_spec): logger.debug('Got call to register realm %s with manager', realm_spec['realm']) # Create the build information block for the registered realm. build_job = BuildJob(AttrDict(realm_spec['job_queue_item'])) execution_id = realm_spec.get('execution_id', None) executor_name = realm_spec.get('executor_name', 'EC2Executor') logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec) component = self.register_component(realm_spec['realm'], BuildComponent, token=realm_spec['token']) build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id, executor_name=executor_name) self._component_to_job[component] = build_job self._build_uuid_to_info[build_job.build_uuid] = build_info logger.debug('Registered realm %s with manager', realm_spec['realm']) return component
async def _job_callback(self, key_change): """ This is the callback invoked when keys related to jobs are changed. It ignores all events related to the creation of new jobs. Deletes or expirations cause checks to ensure they've been properly marked as completed. :param key_change: the event and value produced by a key changing in the orchestrator :type key_change: :class:`KeyChange` """ if key_change.event in (KeyEvent.CREATE, KeyEvent.SET): return elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE): # Handle the expiration/deletion. job_metadata = json.loads(key_change.value) build_job = BuildJob(AttrDict(job_metadata["job_queue_item"])) logger.debug('Got "%s" of job %s', key_change.event, build_job.build_uuid) # Get the build info. build_info = self._build_uuid_to_info.get(build_job.build_uuid, None) if build_info is None: logger.debug( 'No build info for "%s" job %s (%s); probably already deleted by this manager', key_change.event, build_job.build_uuid, job_metadata, ) return if key_change.event != KeyEvent.EXPIRE: # If the etcd action was not an expiration, then it was already deleted by some manager and # the execution was therefore already shutdown. All that's left is to remove the build info. self._build_uuid_to_info.pop(build_job.build_uuid, None) return logger.debug("got expiration for job %s with metadata: %s", build_job.build_uuid, job_metadata) if not job_metadata.get("had_heartbeat", False): # If we have not yet received a heartbeat, then the node failed to boot in some way. # We mark the job as incomplete here. await self._mark_job_incomplete(build_job, build_info) # Finally, we terminate the build execution for the job. We don't do this under a lock as # terminating a node is an atomic operation; better to make sure it is terminated than not. logger.debug( "Terminating expired build executor for job %s with execution id %s", build_job.build_uuid, build_info.execution_id, ) await self.kill_builder_executor(build_job.build_uuid) else: logger.warning("Unexpected KeyEvent (%s) on job key: %s", key_change.event, key_change.key)
def job_heartbeat(self, job_id): """Extend the processing time in the queue and updates the ttl of the job in the orchestrator. """ try: job_data = self._orchestrator.get_key(job_id) job_data_json = json.loads(job_data) build_job = BuildJob(AttrDict(job_data_json["job_queue_item"])) except KeyError: logger.warning( "Job %s no longer exists in the orchestrator, likely expired", job_id) return False except Exception as e: logger.error("Exception loading job %s from orchestrator: %s", job_id, e) return False max_expiration = datetime.utcfromtimestamp( job_data_json["max_expiration"]) max_expiration_remaining = max_expiration - datetime.utcnow() max_expiration_sec = max(1, int(max_expiration_remaining.total_seconds())) ttl = min(HEARTBEAT_PERIOD_SECONDS * 2, max_expiration_sec) # Update job expirations if (job_data_json["last_heartbeat"] and dateutil.parser.isoparse(job_data_json["last_heartbeat"]) < datetime.utcnow() - HEARTBEAT_DELTA): logger.warning( "Heartbeat expired for job %s. Marking job as expired. Last heartbeat received at %s", job_data_json["last_heartbeat"], ) self.update_job_phase(job_id, BUILD_PHASE.INTERNAL_ERROR) return False job_data_json["last_heartbeat"] = str(datetime.utcnow()) self._queue.extend_processing( build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS, minimum_extension=MINIMUM_JOB_EXTENSION, ) try: self._orchestrator.set_key(job_id, json.dumps(job_data_json), overwrite=True, expiration=ttl) except OrchestratorConnectionError: logger.error( "Could not update heartbeat for job %s. Orchestrator is not available", job_id) return False return True
def update_job_phase(self, job_id, phase, phase_metadata=None): """Updates the given job's phase and append the phase change to the buildlogs, with the given phase metadata. If the job reaches a completed state, update_job_phase also update the queue and cleanups any existing state and executors. """ try: job_data = self._orchestrator.get_key(job_id) job_data_json = json.loads(job_data) build_job = BuildJob(AttrDict(job_data_json["job_queue_item"])) except KeyError: logger.warning( "Job %s no longer exists in the orchestrator, likely expired", job_id) return False except Exception as e: logger.error("Exception loading job %s from orchestrator: %s", job_id, e) return False # Check if the build has not already reached a final phase if build_job.repo_build.phase in EphemeralBuilderManager.ARCHIVABLE_BUILD_PHASES: logger.warning( "Job %s is already in a final completed phase (%s), cannot update to %s", job_id, build_job.repo_build.phase, phase, ) return False # Update the build phase phase_metadata = phase_metadata or {} updated = model.build.update_phase_then_close(build_job.build_uuid, phase) if updated: self.append_log_message(build_job.build_uuid, phase, self._build_logs.PHASE, phase_metadata) # Check if on_job_complete needs to be called if updated and phase in EphemeralBuilderManager.COMPLETED_PHASES: executor_name = job_data_json.get("executor_name") execution_id = job_data_json.get("execution_id") if phase == BUILD_PHASE.ERROR: self.on_job_complete(build_job, BuildJobResult.ERROR, executor_name, execution_id) elif phase == BUILD_PHASE.COMPLETE: self.on_job_complete(build_job, BuildJobResult.COMPLETE, executor_name, execution_id) elif phase == BUILD_PHASE.INTERNAL_ERROR: self.on_job_complete(build_job, BuildJobResult.INCOMPLETE, executor_name, execution_id) elif phase == BUILD_PHASE.CANCELLED: self.on_job_complete(build_job, BuildJobResult.CANCELLED, executor_name, execution_id) return updated
def job_scheduled(self, job_id, control_plane, execution_id, max_startup_time): """Mark the given job as scheduled with execution id, with max_startup_time. A job is considered scheduled once a worker is started with a given registration token. """ # Get job to schedule try: job_data = self._orchestrator.get_key(job_id) job_data_json = json.loads(job_data) except KeyError: logger.warning( "Failed to mark job %s as scheduled. Job no longer exists in the orchestrator", job_id, ) return False except Exception as e: logger.warning("Exception loading job %s from orchestrator: %s", job_id, e) return False # Update build context job_data_json["executor_name"] = control_plane job_data_json["execution_id"] = execution_id try: self._orchestrator.set_key(job_id, json.dumps(job_data_json), overwrite=True, expiration=max_startup_time) except Exception as e: logger.warning("Exception updating job %s in orchestrator: %s", job_id, e) return False build_job = BuildJob(AttrDict(job_data_json["job_queue_item"])) updated = self.update_job_phase(job_id, BUILD_PHASE.BUILD_SCHEDULED) if updated: self._queue.extend_processing( build_job.job_item, seconds_from_now=max_startup_time + 60, # Add some leeway to allow the expiry event to complete minimum_extension=MINIMUM_JOB_EXTENSION, ) logger.debug( "Job scheduled for job %s with execution with ID %s on control plane %s with max startup time of %s", job_id, execution_id, control_plane, max_startup_time, ) else: logger.warning( "Job %s not scheduled. Unable update build phase to SCHEDULED", job_id) return updated
def _realm_callback(self, key_change): logger.debug("realm callback for key: %s", key_change.key) if key_change.event == KeyEvent.CREATE: # Listen on the realm created by ourselves or another worker. realm_spec = json.loads(key_change.value) self._register_realm(realm_spec) elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE): # Stop listening for new connections on the realm, if we did not get the connection. realm_spec = json.loads(key_change.value) realm_id = realm_spec["realm"] build_job = BuildJob(AttrDict(realm_spec["job_queue_item"])) build_uuid = build_job.build_uuid logger.debug("Realm key %s for build %s was %s", realm_id, build_uuid, key_change.event) build_info = self._build_uuid_to_info.get(build_uuid, None) if build_info is not None: # Pop off the component and if we find one, then the build has not connected to this # manager, so we can safely unregister its component. component = self._component_to_job.pop(build_info.component, None) if component is not None: # We were not the manager which the worker connected to, remove the bookkeeping for it logger.debug("Unregistering unused component for build %s", build_uuid) self.unregister_component(build_info.component) # If the realm has expired, then perform cleanup of the executor. if key_change.event == KeyEvent.EXPIRE: execution_id = realm_spec.get("execution_id", None) executor_name = realm_spec.get("executor_name", "EC2Executor") # Cleanup the job, since it never started. logger.debug("Job %s for incomplete marking: %s", build_uuid, build_info) if build_info is not None: yield From(self._mark_job_incomplete( build_job, build_info)) # Cleanup the executor. logger.info( "Realm %s expired for job %s, terminating executor %s with execution id %s", realm_id, build_uuid, executor_name, execution_id, ) yield From(self.terminate_executor(executor_name, execution_id)) else: logger.warning("Unexpected action (%s) on realm key: %s", key_change.event, key_change.key)
def _job_cancelled_callback(self, key_change): if key_change.event not in (KeyEvent.CREATE, KeyEvent.SET): return job_metadata = json.loads(key_change.value) build_job = BuildJob(AttrDict(job_metadata["job_queue_item"])) executor_name = job_metadata.get("executor_name") execution_id = job_metadata.get("execution_id") job_result = BuildJobResult.CANCELLED self.on_job_complete(build_job, job_result, executor_name, execution_id)
def _build_job_from_job_id(self, job_id): """Return the BuildJob from the job id.""" try: job_data = self._orchestrator.get_key(job_id) except KeyError: raise BuildJobDoesNotExistsError(job_id) except (OrchestratorConnectionError, OrchestratorError) as oe: raise BuildJobError(oe) job_metadata = json.loads(job_data) build_job = BuildJob(AttrDict(job_metadata["job_queue_item"])) return build_job
def _job_expired_callback(self, key_change): """Callback invoked when job key is changed, except for CREATE, SET events. DELETE and EXPIRE exvents make sure the build is marked as completed and remove any state tracking, executors left. """ if key_change.event == KeyEvent.EXPIRE: job_metadata = json.loads(key_change.value) build_job = BuildJob(AttrDict(job_metadata["job_queue_item"])) executor_name = job_metadata.get("executor_name") execution_id = job_metadata.get("execution_id") job_result = BuildJobResult.EXPIRED model.build.update_phase_then_close(build_job.build_uuid, RESULT_PHASES[job_result]) self.on_job_complete(build_job, job_result, executor_name, execution_id)
def cancel_build(self, build_id): build = model.build.get_repository_build(build_id) if build.phase in EphemeralBuilderManager.PHASES_NOT_ALLOWED_TO_CANCEL_FROM: return False cancelled = model.build.update_phase_then_close(build_id, BUILD_PHASE.CANCELLED) if cancelled: try: job_data = self._orchestrator.get_key(self._job_key(build_id)) job_data_json = json.loads(job_data) build_job = BuildJob(AttrDict(job_data_json["job_queue_item"])) self.on_job_complete( build_job, BuildJobResult.CANCELLED, job_data_json.get("executor_name"), job_data_json.get("execution_id"), ) except KeyError: logger.warning( "Could not cleanup cancelled job %s. Job does not exist in orchestrator", job_id ) return cancelled
def _work_checker(self): logger.debug("Initializing work checker") while True: logger.debug("Writing queue metrics") self._queue.update_metrics() with database.CloseForLongOperation(app.config): time.sleep(WORK_CHECK_TIMEOUT) logger.debug("Checking for more work from the build queue") processing_time = EPHEMERAL_SETUP_TIMEOUT + SETUP_LEEWAY_SECONDS job_item = self._queue.get(processing_time=processing_time, ordering_required=True) if job_item is None: logger.debug( "No additional work found. Going to sleep for %s seconds", WORK_CHECK_TIMEOUT ) continue try: build_job = BuildJob(job_item) except BuildJobLoadException as bjle: logger.error( "BuildJobLoadException. Job data: %s. No retry restore. - %s", job_item.body, bjle, ) self._queue.incomplete(job_item, restore_retry=False) continue build_id = build_job.build_uuid job_id = self._job_key(build_id) try: logger.debug("Creating build job for build %s", build_id) self.create_job(build_id, {"job_queue_item": build_job.job_item}) except BuildJobAlreadyExistsError: logger.warning( "Attempted to create job %s that already exists. Cleaning up existing job and returning it to the queue.", job_id, ) self.job_unschedulable(job_id) self._queue.incomplete(job_item, restore_retry=True) continue except BuildJobError as je: logger.error("Create job exception. Build %s - %s", build_id, je) self._queue.incomplete(job_item, restore_retry=True) continue try: logger.debug("Scheduling build job %s", job_id) schedule_success, retry_timeout = self.schedule(build_id) except Exception as se: logger.exception("Exception when scheduling job %s: %s", build_job.build_uuid, se) self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT) continue if schedule_success: logger.debug("Build job %s scheduled.", job_id) else: logger.warning( "Unsuccessful schedule. Build ID: %s. Retry restored.", build_job.repo_build.uuid, ) self.job_unschedulable(job_id) self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)
def start_job(self, job_id, max_build_time): """Starts the build job. This is invoked by the worker once the job has been created and scheduled, returing the buildpack needed to start the actual build. """ try: job_data = self._orchestrator.get_key(job_id) job_data_json = json.loads(job_data) build_job = BuildJob(AttrDict(job_data_json["job_queue_item"])) except KeyError: logger.warning("Failed to start job %s. Job does not exists in orchestrator", job_id) return None, None except Exception as e: logger.error("Exception loading job %s from orchestrator: %s", job_id, e) return None, None # Construct the buildpack repo = build_job.repo_build.repository repository_name = repo.namespace_user.username + "/" + repo.name context, dockerfile_path = build_job.extract_dockerfile_args() base_image_information = {} if build_job.pull_credentials: base_image_information["username"] = build_job.pull_credentials.get("username", "") base_image_information["password"] = build_job.pull_credentials.get("password", "") build_args = { "build_package": build_job.get_build_package_url(self._user_files), "context": context, "dockerfile_path": dockerfile_path, "repository": repository_name, "registry": self._registry_hostname, "pull_token": build_job.repo_build.access_token.get_code(), "push_token": build_job.repo_build.access_token.get_code(), "tag_names": build_job.build_config.get("docker_tags", ["latest"]), "base_image": base_image_information, } private_key = None if ( build_job.repo_build.trigger is not None and build_job.repo_build.trigger.secure_private_key is not None ): private_key = build_job.repo_build.trigger.secure_private_key.decrypt() if private_key is not None: build_args["git"] = { "url": build_job.build_config["trigger_metadata"].get("git_url", ""), "sha": build_job.commit_sha(), "private_key": private_key or "", } # If the build args have no buildpack, mark it as a failure before sending # it to a builder instance. if not build_args["build_package"] and not build_args["git"]: logger.error( "Failed to start job %s: insufficient build args - No package url or git", job_id, ) self.update_job_phase(job_id, BUILD_PHASE.INTERNAL_ERROR) return (None, None) # Generate the build token token = self.generate_build_token( BUILD_JOB_TOKEN_TYPE, build_job.build_uuid, job_id, max_build_time ) # Publish the time it took for a worker to ack the build self._write_duration_metric(build_ack_duration, build_job.build_uuid) logger.debug("Started build job %s with arguments %s", job_id, build_args) return (token, build_args)
async def _work_checker(self): logger.debug("Initializing work checker") while self._current_status == BuildServerStatus.RUNNING: with database.CloseForLongOperation(app.config): await asyncio.sleep(WORK_CHECK_TIMEOUT) logger.debug( "Checking for more work for %d active workers", self._lifecycle_manager.num_workers(), ) processing_time = self._lifecycle_manager.overall_setup_time() + SETUP_LEEWAY_SECONDS job_item = self._queue.get(processing_time=processing_time, ordering_required=True) if job_item is None: logger.debug( "No additional work found. Going to sleep for %s seconds", WORK_CHECK_TIMEOUT ) continue try: build_job = BuildJob(job_item) except BuildJobLoadException as irbe: logger.warning( "[BUILD INCOMPLETE: job load exception] Job data: %s. No retry restore.", job_item.body, ) logger.exception(irbe) self._queue.incomplete(job_item, restore_retry=False) continue logger.debug( "Checking for an avaliable worker for build job %s", build_job.repo_build.uuid ) try: schedule_success, retry_timeout = await self._lifecycle_manager.schedule(build_job) except: logger.warning( "[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.", build_job.repo_build.uuid, ) logger.exception("Exception when scheduling job: %s", build_job.repo_build.uuid) self._current_status = BuildServerStatus.EXCEPTION self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT) return if schedule_success: logger.debug("Marking build %s as scheduled", build_job.repo_build.uuid) status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid) await status_handler.set_phase(database.BUILD_PHASE.BUILD_SCHEDULED) self._job_count = self._job_count + 1 logger.debug( "Build job %s scheduled. Running: %s", build_job.repo_build.uuid, self._job_count, ) else: logger.warning( "[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.", build_job.repo_build.uuid, ) logger.debug( "All workers are busy for job %s Requeuing after %s seconds.", build_job.repo_build.uuid, retry_timeout, ) self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)