def start(self):
    """
    Run the worker's main loop until termination is requested, then clean up.

    Loads and syncs state, starts the image manager (and the dependency manager
    when not running on a shared file system), and then loops: process runs,
    save state, check in, check for termination, save state again. The loop ends
    when `self.terminate` is set, either by one of the stop checks or by the
    exception handling below. `self.cleanup()` is called once the loop exits.

    This method does not return a value (it always returns None); any exception
    raised inside a loop iteration is reported and handled here, not propagated.
    """
    self.load_state()
    self.sync_state()
    self.image_manager.start()
    if not self.shared_file_system:
        self.dependency_manager.start()

    while not self.terminate:
        try:
            self.process_runs()
            self.save_state()
            self.checkin()
            self.check_termination()
            self.save_state()
            if self.check_idle_stop() or self.check_num_runs_stop():
                self.terminate = True
            else:
                time.sleep(self.checkin_frequency_seconds)
        except Exception:
            self.last_checkin_successful = False
            if using_sentry():
                capture_exception()
            traceback.print_exc()
            if self.exit_on_exception:
                logger.warning(
                    'Encountered exception, terminating the worker after sleeping for 5 minutes...'
                )
                self.terminate = True
                # Sleep for 5 minutes
                time.sleep(5 * 60)
            else:
                # Sleep for a long time so we don't keep on failing.
                # We sleep in 5-second increments to check
                # if the worker needs to terminate (say, if it's received
                # a SIGTERM signal).
                logger.warning('Sleeping for 1 hour due to exception...please help me!')
                # 12 * 60 naps of 5 seconds each add up to one hour.
                for _ in range(12 * 60):
                    # We run this here, instead of going through another iteration of the
                    # while loop, to minimize the code that's run---the reason we ended up here
                    # in the first place is because of an exception, so we don't want to
                    # re-trigger that exception.
                    if self.terminate_and_restage:
                        # If self.terminate_and_restage is true, self.check_termination()
                        # restages bundles. We surround this in a try-except block,
                        # so we can still properly terminate and clean up
                        # even if self.check_termination() fails for some reason.
                        try:
                            self.check_termination()
                        except Exception:
                            traceback.print_exc()
                            self.terminate = True
                    if self.terminate:
                        break
                    time.sleep(5)

    self.cleanup()
def image_availability_state(image_spec, success_message, failure_message):
    """
    Try to get the image specified by image_spec from host machine.
    Return ImageAvailabilityState.

    On success the image is re-tagged as `<self.CACHE_TAG>:<current time>` and any
    older `self.CACHE_TAG` tags found on it are removed, so only the newest
    timestamp tag remains.

    Note: `self` is not a parameter; it is resolved from the enclosing scope.

    :param image_spec: image reference passed to `self._docker.images.get`.
    :param success_message: message stored in the returned state when the image is found.
    :param failure_message: %-style template formatted with the caught exception
        (`failure_message % ex`) when the lookup or tagging fails.
    :return: ImageAvailabilityState with stage READY and the first repo digest
        (or None if the image has none), or stage FAILED with digest None.
    """
    try:
        image = self._docker.images.get(image_spec)
        digests = image.attrs.get('RepoDigests', [image_spec])
        # Truthiness covers both an empty list and a RepoDigests value of None.
        digest = digests[0] if digests else None
        new_timestamp = str(time.time())
        image.tag(self.CACHE_TAG, tag=new_timestamp)
        for tag in image.tags:
            # Split on the LAST colon only: the repository part may itself contain
            # a colon (e.g. a registry host with a port, "localhost:5000/img:latest"),
            # and a plain two-way split would raise and mark a present image FAILED.
            tag_label, _, timestamp = tag.rpartition(":")
            # remove any other timestamp but not the current one
            if tag_label == self.CACHE_TAG and timestamp != new_timestamp:
                try:
                    self._docker.images.remove(tag)
                except docker.errors.NotFound as err:
                    # It's possible that we get a 404 not found error here when removing the image,
                    # since another worker on the same system has already done so. We just
                    # ignore this 404, since any extraneous tags will be removed during the next iteration.
                    logger.warning(
                        "Attempted to remove image %s from cache, but image was not found: %s",
                        tag,
                        err,
                    )
        return ImageAvailabilityState(
            digest=digest, stage=DependencyStage.READY, message=success_message
        )
    except Exception as ex:
        if using_sentry():
            capture_exception()
        return ImageAvailabilityState(
            digest=None, stage=DependencyStage.FAILED, message=failure_message % ex
        )