Example #1
0
    def determine_cached_tag(self, build_id, base_image_id):
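        """Looks up the build job for the given build in the orchestrator and returns its cached
        tag for the given base image, or None if the job could not be loaded.
        """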
        job_id = self._job_key(build_id)
        try:
            job_data = self._orchestrator.get_key(job_id)
            job_data_json = json.loads(job_data)
            build_job = BuildJob(AttrDict(job_data_json["job_queue_item"]))
        except KeyError:
            logger.warning("Job %s does not exist in orchestrator", job_id)
            return None
        except Exception as e:
            logger.warning("Exception loading job from orchestrator: %s", e)
            return None

        return build_job.determine_cached_tag(base_image_id)
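
The pattern here (and in several of the examples below) is always the same: fetch the serialized job by key, json-decode it, and treat a missing key as the job having expired or never existed. A minimal, self-contained sketch of that flow, using a dict-backed stand-in for the orchestrator and a made-up key name:

import json


class FakeOrchestrator:
    """Dict-backed stand-in for the real orchestrator client (hypothetical)."""

    def __init__(self, initial=None):
        self._store = dict(initial or {})

    def get_key(self, key):
        # Same contract the code above relies on: raise KeyError when the key is gone.
        return self._store[key]


orchestrator = FakeOrchestrator({
    "building/job/1234": json.dumps({"job_queue_item": {"id": 42, "body": "{}"}}),
})

try:
    job_data_json = json.loads(orchestrator.get_key("building/job/1234"))
    print(job_data_json["job_queue_item"]["id"])  # 42
except KeyError:
    print("job expired or was never created")     # branch taken when the key is missing
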
Example #2
0
    def _register_realm(self, realm_spec):
        logger.debug("Got call to register realm %s with manager",
                     realm_spec["realm"])

        # Create the build information block for the registered realm.
        build_job = BuildJob(AttrDict(realm_spec["job_queue_item"]))
        execution_id = realm_spec.get("execution_id", None)
        executor_name = realm_spec.get("executor_name", "EC2Executor")

        logger.debug("Registering realm %s with manager: %s",
                     realm_spec["realm"], realm_spec)
        component = self.register_component(realm_spec["realm"],
                                            BuildComponent,
                                            token=realm_spec["token"])

        build_info = BuildInfo(
            component=component,
            build_job=build_job,
            execution_id=execution_id,
            executor_name=executor_name,
        )

        self._component_to_job[component] = build_job
        self._build_uuid_to_info[build_job.build_uuid] = build_info

        logger.debug("Registered realm %s with manager", realm_spec["realm"])
        return component
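
For reference, the shape of the realm_spec this method consumes; the field names are taken from the code above, the values are made up for illustration:

realm_spec = {
    "realm": "realm-6f2c1d",           # realm identifier the worker connects on
    "token": "registration-token",     # token presented by the worker
    "execution_id": "i-0abc123",       # id of the started instance, when known
    "executor_name": "EC2Executor",    # defaults to "EC2Executor" when absent
    "job_queue_item": {"body": "{}"},  # raw queue item wrapped by BuildJob
}
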
Example #3
0
    def _register_realm(self, realm_spec):
        logger.debug('Got call to register realm %s with manager',
                     realm_spec['realm'])

        # Create the build information block for the registered realm.
        build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
        execution_id = realm_spec.get('execution_id', None)
        executor_name = realm_spec.get('executor_name', 'EC2Executor')

        logger.debug('Registering realm %s with manager: %s',
                     realm_spec['realm'], realm_spec)
        component = self.register_component(realm_spec['realm'],
                                            BuildComponent,
                                            token=realm_spec['token'])

        build_info = BuildInfo(component=component,
                               build_job=build_job,
                               execution_id=execution_id,
                               executor_name=executor_name)

        self._component_to_job[component] = build_job
        self._build_uuid_to_info[build_job.build_uuid] = build_info

        logger.debug('Registered realm %s with manager', realm_spec['realm'])
        return component
Example #4
0
    async def _job_callback(self, key_change):
        """
        This is the callback invoked when keys related to jobs are changed. It ignores all events
        related to the creation of new jobs. Deletes or expirations cause checks to ensure they've
        been properly marked as completed.

        :param key_change: the event and value produced by a key changing in the orchestrator
        :type key_change: :class:`KeyChange`
        """
        if key_change.event in (KeyEvent.CREATE, KeyEvent.SET):
            return

        elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
            # Handle the expiration/deletion.
            job_metadata = json.loads(key_change.value)
            build_job = BuildJob(AttrDict(job_metadata["job_queue_item"]))
            logger.debug('Got "%s" of job %s', key_change.event,
                         build_job.build_uuid)

            # Get the build info.
            build_info = self._build_uuid_to_info.get(build_job.build_uuid,
                                                      None)
            if build_info is None:
                logger.debug(
                    'No build info for "%s" job %s (%s); probably already deleted by this manager',
                    key_change.event,
                    build_job.build_uuid,
                    job_metadata,
                )
                return

            if key_change.event != KeyEvent.EXPIRE:
                # If the etcd action was not an expiration, then it was already deleted by some manager and
                # the execution was therefore already shut down. All that's left is to remove the build info.
                self._build_uuid_to_info.pop(build_job.build_uuid, None)
                return

            logger.debug("got expiration for job %s with metadata: %s",
                         build_job.build_uuid, job_metadata)

            if not job_metadata.get("had_heartbeat", False):
                # If we have not yet received a heartbeat, then the node failed to boot in some way.
                # We mark the job as incomplete here.
                await self._mark_job_incomplete(build_job, build_info)

            # Finally, we terminate the build execution for the job. We don't do this under a lock as
            # terminating a node is an atomic operation; better to make sure it is terminated than not.
            logger.debug(
                "Terminating expired build executor for job %s with execution id %s",
                build_job.build_uuid,
                build_info.execution_id,
            )
            await self.kill_builder_executor(build_job.build_uuid)
        else:
            logger.warning("Unexpected KeyEvent (%s) on job key: %s",
                           key_change.event, key_change.key)
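
The branching above reduces to a small decision table over the event type and the job's had_heartbeat flag (the early return when no build info is tracked is left out here). A standalone sketch of that table; the KeyEvent enum and action names below are stand-ins defined for the example, not the project's own types:

from enum import Enum


class KeyEvent(Enum):  # stand-in for the orchestrator's KeyEvent
    CREATE = 1
    SET = 2
    DELETE = 3
    EXPIRE = 4


def actions_for(event, had_heartbeat):
    """Return the cleanup actions the callback above takes, in order."""
    if event in (KeyEvent.CREATE, KeyEvent.SET):
        return []                                 # new/updated jobs are ignored
    if event == KeyEvent.DELETE:
        return ["drop_build_info"]                # another manager already cleaned up
    if event == KeyEvent.EXPIRE:
        steps = [] if had_heartbeat else ["mark_job_incomplete"]
        return steps + ["kill_builder_executor"]  # the execution is always terminated
    return ["log_unexpected_event"]


print(actions_for(KeyEvent.EXPIRE, had_heartbeat=False))
# ['mark_job_incomplete', 'kill_builder_executor']
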
Example #5
0
    def job_heartbeat(self, job_id):
        """Extend the processing time in the queue and updates the ttl of the job in the
        orchestrator.
        """
        try:
            job_data = self._orchestrator.get_key(job_id)
            job_data_json = json.loads(job_data)
            build_job = BuildJob(AttrDict(job_data_json["job_queue_item"]))
        except KeyError:
            logger.warning(
                "Job %s no longer exists in the orchestrator, likely expired",
                job_id)
            return False
        except Exception as e:
            logger.error("Exception loading job %s from orchestrator: %s",
                         job_id, e)
            return False

        max_expiration = datetime.utcfromtimestamp(
            job_data_json["max_expiration"])
        max_expiration_remaining = max_expiration - datetime.utcnow()
        max_expiration_sec = max(1,
                                 int(max_expiration_remaining.total_seconds()))
        ttl = min(HEARTBEAT_PERIOD_SECONDS * 2, max_expiration_sec)

        # Update job expirations
        if (job_data_json["last_heartbeat"]
                and dateutil.parser.isoparse(job_data_json["last_heartbeat"]) <
                datetime.utcnow() - HEARTBEAT_DELTA):
            logger.warning(
                "Heartbeat expired for job %s. Marking job as expired. Last heartbeat received at %s",
                job_data_json["last_heartbeat"],
            )
            self.update_job_phase(job_id, BUILD_PHASE.INTERNAL_ERROR)
            return False

        job_data_json["last_heartbeat"] = str(datetime.utcnow())

        self._queue.extend_processing(
            build_job.job_item,
            seconds_from_now=JOB_TIMEOUT_SECONDS,
            minimum_extension=MINIMUM_JOB_EXTENSION,
        )

        try:
            self._orchestrator.set_key(job_id,
                                       json.dumps(job_data_json),
                                       overwrite=True,
                                       expiration=ttl)
        except OrchestratorConnectionError:
            logger.error(
                "Could not update heartbeat for job %s. Orchestrator is not available",
                job_id)
            return False

        return True
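
The TTL arithmetic is the subtle part: the heartbeat key lives for at most two heartbeat periods, never past the job's absolute max_expiration, and never less than one second. A standalone recomputation of it (the constant's value here is illustrative, not the project's setting):

from datetime import datetime, timedelta

HEARTBEAT_PERIOD_SECONDS = 30  # illustrative value


def heartbeat_ttl(max_expiration, now):
    remaining = max(1, int((max_expiration - now).total_seconds()))
    return min(HEARTBEAT_PERIOD_SECONDS * 2, remaining)


now = datetime.utcnow()
print(heartbeat_ttl(now + timedelta(hours=1), now))     # 60: capped at two heartbeat periods
print(heartbeat_ttl(now + timedelta(seconds=10), now))  # 10: capped by the time remaining
print(heartbeat_ttl(now - timedelta(seconds=5), now))   # 1:  already past max_expiration
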
Example #6
0
    def update_job_phase(self, job_id, phase, phase_metadata=None):
        """Updates the given job's phase and append the phase change to the buildlogs, with the
        given phase metadata. If the job reaches a completed state, update_job_phase also update the
        queue and cleanups any existing state and executors.
        """
        try:
            job_data = self._orchestrator.get_key(job_id)
            job_data_json = json.loads(job_data)
            build_job = BuildJob(AttrDict(job_data_json["job_queue_item"]))
        except KeyError:
            logger.warning(
                "Job %s no longer exists in the orchestrator, likely expired",
                job_id)
            return False
        except Exception as e:
            logger.error("Exception loading job %s from orchestrator: %s",
                         job_id, e)
            return False

        # Check if the build has not already reached a final phase
        if build_job.repo_build.phase in EphemeralBuilderManager.ARCHIVABLE_BUILD_PHASES:
            logger.warning(
                "Job %s is already in a final completed phase (%s), cannot update to %s",
                job_id,
                build_job.repo_build.phase,
                phase,
            )
            return False

        # Update the build phase
        phase_metadata = phase_metadata or {}
        updated = model.build.update_phase_then_close(build_job.build_uuid,
                                                      phase)
        if updated:
            self.append_log_message(build_job.build_uuid, phase,
                                    self._build_logs.PHASE, phase_metadata)

        # Check if on_job_complete needs to be called
        if updated and phase in EphemeralBuilderManager.COMPLETED_PHASES:
            executor_name = job_data_json.get("executor_name")
            execution_id = job_data_json.get("execution_id")

            if phase == BUILD_PHASE.ERROR:
                self.on_job_complete(build_job, BuildJobResult.ERROR,
                                     executor_name, execution_id)
            elif phase == BUILD_PHASE.COMPLETE:
                self.on_job_complete(build_job, BuildJobResult.COMPLETE,
                                     executor_name, execution_id)
            elif phase == BUILD_PHASE.INTERNAL_ERROR:
                self.on_job_complete(build_job, BuildJobResult.INCOMPLETE,
                                     executor_name, execution_id)
            elif phase == BUILD_PHASE.CANCELLED:
                self.on_job_complete(build_job, BuildJobResult.CANCELLED,
                                     executor_name, execution_id)

        return updated
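
The if/elif chain at the end is a fixed mapping from terminal build phase to job result. A dictionary-based equivalent, sketched with string stand-ins for the BUILD_PHASE and BuildJobResult constants, expresses the same dispatch in one lookup:

# String stand-ins for the real BUILD_PHASE / BuildJobResult constants.
PHASE_TO_RESULT = {
    "error": "error",
    "complete": "complete",
    "internal_error": "incomplete",
    "cancelled": "cancelled",
}


def result_for_phase(phase):
    return PHASE_TO_RESULT.get(phase)  # None for non-terminal phases


print(result_for_phase("internal_error"))  # incomplete
print(result_for_phase("building"))        # None

Whether the table or the chain reads better is a matter of taste; the chain in the original keeps the four terminal phases explicit at each call site.
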
Example #7
0
    def job_scheduled(self, job_id, control_plane, execution_id,
                      max_startup_time):
        """Mark the given job as scheduled with execution id, with max_startup_time.
        A job is considered scheduled once a worker is started with a given registration token.
        """
        # Get job to schedule
        try:
            job_data = self._orchestrator.get_key(job_id)
            job_data_json = json.loads(job_data)
        except KeyError:
            logger.warning(
                "Failed to mark job %s as scheduled. Job no longer exists in the orchestrator",
                job_id,
            )
            return False
        except Exception as e:
            logger.warning("Exception loading job %s from orchestrator: %s",
                           job_id, e)
            return False

        # Update build context
        job_data_json["executor_name"] = control_plane
        job_data_json["execution_id"] = execution_id
        try:
            self._orchestrator.set_key(job_id,
                                       json.dumps(job_data_json),
                                       overwrite=True,
                                       expiration=max_startup_time)
        except Exception as e:
            logger.warning("Exception updating job %s in orchestrator: %s",
                           job_id, e)
            return False

        build_job = BuildJob(AttrDict(job_data_json["job_queue_item"]))
        updated = self.update_job_phase(job_id, BUILD_PHASE.BUILD_SCHEDULED)
        if updated:
            self._queue.extend_processing(
                build_job.job_item,
                # Add some leeway to allow the expiry event to complete.
                seconds_from_now=max_startup_time + 60,
                minimum_extension=MINIMUM_JOB_EXTENSION,
            )

            logger.debug(
                "Job scheduled for job %s with execution with ID %s on control plane %s with max startup time of %s",
                job_id,
                execution_id,
                control_plane,
                max_startup_time,
            )
        else:
            logger.warning(
                "Job %s not scheduled. Unable update build phase to SCHEDULED",
                job_id)

        return updated
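
The two timeouts set here are related but not equal: the orchestrator key expires after max_startup_time, while the queue reservation is extended by max_startup_time plus a fixed leeway, so the expiry callback can run and requeue the job before the queue itself reclaims it. A toy check of that invariant (numbers illustrative):

max_startup_time = 300              # illustrative: five minutes for a worker to boot
LEEWAY_SECONDS = 60                 # matches the "+ 60" above

key_ttl = max_startup_time          # orchestrator key expiration
queue_extension = max_startup_time + LEEWAY_SECONDS

assert key_ttl < queue_extension    # the expiry handler gets a chance to fire first
print(key_ttl, queue_extension)     # 300 360
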
Example #8
0
    def _realm_callback(self, key_change):
        logger.debug("realm callback for key: %s", key_change.key)
        if key_change.event == KeyEvent.CREATE:
            # Listen on the realm created by ourselves or another worker.
            realm_spec = json.loads(key_change.value)
            self._register_realm(realm_spec)

        elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
            # Stop listening for new connections on the realm, if we did not get the connection.
            realm_spec = json.loads(key_change.value)
            realm_id = realm_spec["realm"]

            build_job = BuildJob(AttrDict(realm_spec["job_queue_item"]))
            build_uuid = build_job.build_uuid

            logger.debug("Realm key %s for build %s was %s", realm_id,
                         build_uuid, key_change.event)
            build_info = self._build_uuid_to_info.get(build_uuid, None)
            if build_info is not None:
                # Pop off the component and if we find one, then the build has not connected to this
                # manager, so we can safely unregister its component.
                component = self._component_to_job.pop(build_info.component,
                                                       None)
                if component is not None:
                    # We were not the manager the worker connected to; remove the bookkeeping for it.
                    logger.debug("Unregistering unused component for build %s",
                                 build_uuid)
                    self.unregister_component(build_info.component)

            # If the realm has expired, then perform cleanup of the executor.
            if key_change.event == KeyEvent.EXPIRE:
                execution_id = realm_spec.get("execution_id", None)
                executor_name = realm_spec.get("executor_name", "EC2Executor")

                # Cleanup the job, since it never started.
                logger.debug("Job %s for incomplete marking: %s", build_uuid,
                             build_info)
                if build_info is not None:
                    yield From(self._mark_job_incomplete(
                        build_job, build_info))

                # Cleanup the executor.
                logger.info(
                    "Realm %s expired for job %s, terminating executor %s with execution id %s",
                    realm_id,
                    build_uuid,
                    executor_name,
                    execution_id,
                )
                yield From(self.terminate_executor(executor_name,
                                                   execution_id))

        else:
            logger.warning("Unexpected action (%s) on realm key: %s",
                           key_change.event, key_change.key)
Example #9
0
    def _job_cancelled_callback(self, key_change):
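        """Callback invoked when a job cancellation key is created or set; completes the job
        with a CANCELLED result.
        """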
        if key_change.event not in (KeyEvent.CREATE, KeyEvent.SET):
            return

        job_metadata = json.loads(key_change.value)
        build_job = BuildJob(AttrDict(job_metadata["job_queue_item"]))
        executor_name = job_metadata.get("executor_name")
        execution_id = job_metadata.get("execution_id")

        job_result = BuildJobResult.CANCELLED
        self.on_job_complete(build_job, job_result, executor_name, execution_id)
Example #10
0
    def _build_job_from_job_id(self, job_id):
        """Return the BuildJob from the job id."""
        try:
            job_data = self._orchestrator.get_key(job_id)
        except KeyError:
            raise BuildJobDoesNotExistsError(job_id)
        except (OrchestratorConnectionError, OrchestratorError) as oe:
            raise BuildJobError(oe)

        job_metadata = json.loads(job_data)
        build_job = BuildJob(AttrDict(job_metadata["job_queue_item"]))
        return build_job
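
This method is a small exception-translation boundary: storage-level failures surface as build-domain errors. The same pattern in a self-contained sketch (the exception class and in-memory store here are stand-ins defined for the example):

class BuildJobDoesNotExistsError(Exception):  # stand-in, named after the class used above
    pass


def load_job(store, job_id):
    """Translate a missing key into a build-domain exception."""
    try:
        return store[job_id]
    except KeyError:
        raise BuildJobDoesNotExistsError(job_id)


try:
    load_job({}, "building/job/missing")
except BuildJobDoesNotExistsError as e:
    print("no such job:", e)
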
Example #11
0
    def _job_expired_callback(self, key_change):
        """Callback invoked when job key is changed, except for CREATE, SET events.
        DELETE and EXPIRE exvents make sure the build is marked as completed and remove any
        state tracking, executors left.
        """
        if key_change.event == KeyEvent.EXPIRE:
            job_metadata = json.loads(key_change.value)
            build_job = BuildJob(AttrDict(job_metadata["job_queue_item"]))
            executor_name = job_metadata.get("executor_name")
            execution_id = job_metadata.get("execution_id")

            job_result = BuildJobResult.EXPIRED

            model.build.update_phase_then_close(build_job.build_uuid,
                                                RESULT_PHASES[job_result])
            self.on_job_complete(build_job, job_result, executor_name,
                                 execution_id)
Example #12
0
    def cancel_build(self, build_id):
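        """Cancels the build with the given id and cleans up its job, unless the build has
        already reached a phase from which it cannot be cancelled. Returns whether the build
        was cancelled.
        """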
        build = model.build.get_repository_build(build_id)
        if build.phase in EphemeralBuilderManager.PHASES_NOT_ALLOWED_TO_CANCEL_FROM:
            return False

        cancelled = model.build.update_phase_then_close(build_id, BUILD_PHASE.CANCELLED)
        if cancelled:
            try:
                job_data = self._orchestrator.get_key(self._job_key(build_id))
                job_data_json = json.loads(job_data)
                build_job = BuildJob(AttrDict(job_data_json["job_queue_item"]))
                self.on_job_complete(
                    build_job,
                    BuildJobResult.CANCELLED,
                    job_data_json.get("executor_name"),
                    job_data_json.get("execution_id"),
                )
            except KeyError:
                logger.warning(
                    "Could not cleanup cancelled job %s. Job does not exist in orchestrator", job_id
                )

        return cancelled
Example #13
0
    def _work_checker(self):
        logger.debug("Initializing work checker")
        while True:
            logger.debug("Writing queue metrics")
            self._queue.update_metrics()

            with database.CloseForLongOperation(app.config):
                time.sleep(WORK_CHECK_TIMEOUT)

            logger.debug("Checking for more work from the build queue")
            processing_time = EPHEMERAL_SETUP_TIMEOUT + SETUP_LEEWAY_SECONDS
            job_item = self._queue.get(processing_time=processing_time, ordering_required=True)

            if job_item is None:
                logger.debug(
                    "No additional work found. Going to sleep for %s seconds", WORK_CHECK_TIMEOUT
                )
                continue

            try:
                build_job = BuildJob(job_item)
            except BuildJobLoadException as bjle:
                logger.error(
                    "BuildJobLoadException. Job data: %s. No retry restore. - %s",
                    job_item.body,
                    bjle,
                )
                self._queue.incomplete(job_item, restore_retry=False)
                continue

            build_id = build_job.build_uuid
            job_id = self._job_key(build_id)

            try:
                logger.debug("Creating build job for build %s", build_id)
                self.create_job(build_id, {"job_queue_item": build_job.job_item})
            except BuildJobAlreadyExistsError:
                logger.warning(
                    "Attempted to create job %s that already exists. Cleaning up existing job and returning it to the queue.",
                    job_id,
                )
                self.job_unschedulable(job_id)
                self._queue.incomplete(job_item, restore_retry=True)
                continue
            except BuildJobError as je:
                logger.error("Create job exception. Build %s - %s", build_id, je)
                self._queue.incomplete(job_item, restore_retry=True)
                continue

            try:
                logger.debug("Scheduling build job %s", job_id)
                schedule_success, retry_timeout = self.schedule(build_id)
            except Exception as se:
                logger.exception("Exception when scheduling job %s: %s", build_job.build_uuid, se)
                self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
                continue

            if schedule_success:
                logger.debug("Build job %s scheduled.", job_id)
            else:
                logger.warning(
                    "Unsuccessful schedule. Build ID: %s. Retry restored.",
                    build_job.repo_build.uuid,
                )
                self.job_unschedulable(job_id)
                self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)
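
Each failure mode in the loop disposes of the queue item differently; a compact summary of the branches above, as read from the incomplete() calls (the strings stand in for the project's constants and local variables):

def requeue_decision(failure):
    """How the loop above returns the queue item for each failure mode."""
    return {
        "job_load_error":     dict(restore_retry=False, retry_after=None),
        "job_already_exists": dict(restore_retry=True,  retry_after=None),                  # also calls job_unschedulable
        "create_job_error":   dict(restore_retry=True,  retry_after=None),
        "schedule_exception": dict(restore_retry=True,  retry_after="WORK_CHECK_TIMEOUT"),
        "schedule_failed":    dict(restore_retry=True,  retry_after="retry_timeout"),       # also calls job_unschedulable
    }[failure]


print(requeue_decision("schedule_failed"))
# {'restore_retry': True, 'retry_after': 'retry_timeout'}
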
Example #14
0
    def start_job(self, job_id, max_build_time):
        """Starts the build job. This is invoked by the worker once the job has been created and
        scheduled, returning the buildpack needed to start the actual build.
        """
        try:
            job_data = self._orchestrator.get_key(job_id)
            job_data_json = json.loads(job_data)
            build_job = BuildJob(AttrDict(job_data_json["job_queue_item"]))
        except KeyError:
            logger.warning("Failed to start job %s. Job does not exists in orchestrator", job_id)
            return None, None
        except Exception as e:
            logger.error("Exception loading job %s from orchestrator: %s", job_id, e)
            return None, None

        # Construct the buildpack
        repo = build_job.repo_build.repository
        repository_name = repo.namespace_user.username + "/" + repo.name
        context, dockerfile_path = build_job.extract_dockerfile_args()
        base_image_information = {}
        if build_job.pull_credentials:
            base_image_information["username"] = build_job.pull_credentials.get("username", "")
            base_image_information["password"] = build_job.pull_credentials.get("password", "")

        build_args = {
            "build_package": build_job.get_build_package_url(self._user_files),
            "context": context,
            "dockerfile_path": dockerfile_path,
            "repository": repository_name,
            "registry": self._registry_hostname,
            "pull_token": build_job.repo_build.access_token.get_code(),
            "push_token": build_job.repo_build.access_token.get_code(),
            "tag_names": build_job.build_config.get("docker_tags", ["latest"]),
            "base_image": base_image_information,
        }

        private_key = None
        if (
            build_job.repo_build.trigger is not None
            and build_job.repo_build.trigger.secure_private_key is not None
        ):
            private_key = build_job.repo_build.trigger.secure_private_key.decrypt()

        if private_key is not None:
            build_args["git"] = {
                "url": build_job.build_config["trigger_metadata"].get("git_url", ""),
                "sha": build_job.commit_sha(),
                "private_key": private_key or "",
            }

        # If the build args have no buildpack, mark it as a failure before sending
        # it to a builder instance. ("git" is only set above when a private key was found.)
        if not build_args["build_package"] and not build_args.get("git"):
            logger.error(
                "Failed to start job %s: insufficient build args - No package url or git",
                job_id,
            )
            self.update_job_phase(job_id, BUILD_PHASE.INTERNAL_ERROR)
            return (None, None)

        # Generate the build token
        token = self.generate_build_token(
            BUILD_JOB_TOKEN_TYPE, build_job.build_uuid, job_id, max_build_time
        )

        # Publish the time it took for a worker to ack the build
        self._write_duration_metric(build_ack_duration, build_job.build_uuid)

        logger.debug("Started build job %s with arguments %s", job_id, build_args)
        return (token, build_args)
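
The shape of the build_args dict handed back to the worker, with keys taken from the code above and illustrative values (the hostname, repository, and token placeholders are made up):

build_args = {
    "build_package": "https://userfiles.example.com/build-packages/abc123",  # empty for git-driven builds
    "context": "/",
    "dockerfile_path": "/Dockerfile",
    "repository": "someorg/somerepo",
    "registry": "quay.example.com",
    "pull_token": "<access token>",
    "push_token": "<access token>",
    "tag_names": ["latest"],
    "base_image": {"username": "", "password": ""},
    # Present only when the build trigger has a decrypted private key:
    "git": {"url": "git@github.com:someorg/somerepo.git", "sha": "<commit sha>", "private_key": "<pem>"},
}
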
Example #15
0
    async def _work_checker(self):
        logger.debug("Initializing work checker")
        while self._current_status == BuildServerStatus.RUNNING:
            with database.CloseForLongOperation(app.config):
                await asyncio.sleep(WORK_CHECK_TIMEOUT)

            logger.debug(
                "Checking for more work for %d active workers",
                self._lifecycle_manager.num_workers(),
            )

            processing_time = self._lifecycle_manager.overall_setup_time() + SETUP_LEEWAY_SECONDS
            job_item = self._queue.get(processing_time=processing_time, ordering_required=True)
            if job_item is None:
                logger.debug(
                    "No additional work found. Going to sleep for %s seconds", WORK_CHECK_TIMEOUT
                )
                continue

            try:
                build_job = BuildJob(job_item)
            except BuildJobLoadException as irbe:
                logger.warning(
                    "[BUILD INCOMPLETE: job load exception] Job data: %s. No retry restore.",
                    job_item.body,
                )
                logger.exception(irbe)
                self._queue.incomplete(job_item, restore_retry=False)
                continue

            logger.debug(
                "Checking for an avaliable worker for build job %s", build_job.repo_build.uuid
            )

            try:
                schedule_success, retry_timeout = await self._lifecycle_manager.schedule(build_job)
            except:
                logger.warning(
                    "[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.",
                    build_job.repo_build.uuid,
                )
                logger.exception("Exception when scheduling job: %s", build_job.repo_build.uuid)
                self._current_status = BuildServerStatus.EXCEPTION
                self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
                return

            if schedule_success:
                logger.debug("Marking build %s as scheduled", build_job.repo_build.uuid)
                status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
                await status_handler.set_phase(database.BUILD_PHASE.BUILD_SCHEDULED)

                self._job_count = self._job_count + 1
                logger.debug(
                    "Build job %s scheduled. Running: %s",
                    build_job.repo_build.uuid,
                    self._job_count,
                )
            else:
                logger.warning(
                    "[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.",
                    build_job.repo_build.uuid,
                )
                logger.debug(
                    "All workers are busy for job %s Requeuing after %s seconds.",
                    build_job.repo_build.uuid,
                    retry_timeout,
                )
                self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)