def _transition_from_CLEANING_UP(self, run_state):
    """
    Tear down what the run left behind and pick the next stage.

    1- Remove the docker container if it still exists (killing it first
       when it has not finished yet)
    2- Release every dependency in the dependency manager (skipped on a
       shared file system worker)
    3- Delete each dependency's path inside the bundle directory
    4- Next stage:
         * RESTAGED if the run was restaged
         * UPLOADING_RESULTS if the worker has no shared file system and
           the bundle has contents
         * otherwise whatever self.finalize_run(run_state) returns
    """

    def log_failure_and_pause():
        # Only called from inside an ``except`` clause, so format_exc()
        # still sees the exception currently being handled.
        logger.error(traceback.format_exc())
        time.sleep(1)

    if run_state.container_id is not None:
        # Keep trying until docker no longer reports the container.
        while docker_utils.container_exists(run_state.container):
            try:
                has_exited, _, _ = docker_utils.check_finished(run_state.container)
                if not has_exited:
                    # Still running: kill it and re-check on the next pass.
                    try:
                        run_state.container.kill()
                    except docker.errors.APIError:
                        log_failure_and_pause()
                    continue
                run_state.container.remove(force=True)
                run_state = run_state._replace(container=None, container_id=None)
                break
            except docker.errors.APIError:
                log_failure_and_pause()

    for dependency in run_state.bundle.dependencies:
        key = DependencyKey(dependency.parent_uuid, dependency.parent_path)
        if not self.shared_file_system:  # No dependencies if shared fs worker
            self.dependency_manager.release(run_state.bundle.uuid, key)

        # Best-effort removal of the dependency's path under the bundle
        # directory: a failure is logged and the remaining ones still run.
        path_in_bundle = os.path.join(run_state.bundle_path, dependency.child_path)
        try:
            remove_path(path_in_bundle)
        except Exception:
            logger.error(traceback.format_exc())

    if run_state.is_restaged:
        return run_state._replace(stage=RunStage.RESTAGED)

    if self.shared_file_system or not run_state.has_contents:
        # Nothing to upload from this worker; go straight to finalization.
        return self.finalize_run(run_state)

    return run_state._replace(
        stage=RunStage.UPLOADING_RESULTS, run_status='Uploading results', container=None
    )
def _transition_from_CLEANING_UP(self, run_state):
    """
    1- delete the container if still existent
    2- release the dependencies in dependency manager
       (skipped on a shared file system worker)
    3- remove every path recorded in run_state.paths_to_remove
    4- If restaged, move to RESTAGED state.
       If the worker has no shared file system and the bundle has contents
       to upload (i.e. was RUNNING at some point), move to
       UPLOADING_RESULTS state.
       Otherwise delete the files matching the bundle's exclude_patterns
       and return the result of self.finalize_run(run_state).

    All file deletions in this transition are best-effort: a failure is
    logged and the transition carries on, so one undeletable file cannot
    prevent the run from being finalized.
    """

    def remove_path_no_fail(path):
        # Log the failure instead of propagating it.
        try:
            remove_path(path)
        except Exception:
            logger.error(traceback.format_exc())

    if run_state.container_id is not None:
        # Keep trying until docker no longer reports the container.
        while docker_utils.container_exists(run_state.container):
            try:
                finished, _, _ = docker_utils.check_finished(run_state.container)
                if finished:
                    run_state.container.remove(force=True)
                    run_state = run_state._replace(container=None, container_id=None)
                    break
                else:
                    try:
                        run_state.container.kill()
                    except docker.errors.APIError:
                        logger.error(traceback.format_exc())
                        time.sleep(1)
            except docker.errors.APIError:
                logger.error(traceback.format_exc())
                time.sleep(1)

    if not self.shared_file_system:  # No dependencies if shared fs worker
        for dep in run_state.bundle.dependencies:
            dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
            self.dependency_manager.release(run_state.bundle.uuid, dep_key)

    # Clean up dependencies paths
    for path in run_state.paths_to_remove or []:
        remove_path_no_fail(path)
    run_state = run_state._replace(paths_to_remove=[])

    if run_state.is_restaged:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.RESTAGED,
            reason=self.RESTAGED_REASON,
        )
        return run_state._replace(stage=RunStage.RESTAGED)

    if not self.shared_file_system and run_state.has_contents:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.UPLOADING_RESULTS,
        )
        return run_state._replace(
            stage=RunStage.UPLOADING_RESULTS, run_status='Uploading results', container=None
        )

    # No need to upload results since results are directly written to bundle store
    # Delete any files that match the exclude_patterns.
    try:
        exclude_patterns = run_state.bundle.metadata["exclude_patterns"]
    except KeyError:
        # Metadata without the key is treated as "nothing to exclude".
        exclude_patterns = None

    # The bundle directory must be matched literally; only the user-supplied
    # pattern may contribute glob wildcards.
    escaped_bundle_path = glob.escape(run_state.bundle_path)
    for exclude_pattern in exclude_patterns or []:
        full_pattern = os.path.join(escaped_bundle_path, exclude_pattern)
        for file_path in glob.glob(full_pattern, recursive=True):
            # Only remove files that are subpaths of run_state.bundle_path, in case
            # that exclude_pattern is something like "../../../".
            if path_is_parent(parent_path=run_state.bundle_path, child_path=file_path):
                remove_path_no_fail(file_path)
    return self.finalize_run(run_state)
def _transition_from_RUNNING(self, run_state):
    """
    1- Check run status of the docker container
    2- Refresh resource usage (cpu, memory, disk, container time) and mark
       the run as killed when a time, memory or disk limit is exceeded
    3- If run is killed or restaged, kill the container, stop the disk
       monitor and move to CLEANING_UP state
    4- If run is finished, stop the disk monitor and move to CLEANING_UP state
    5- Otherwise stay in the current stage and return the refreshed state
    """
    # Captured once so the background monitor does not depend on later
    # rebindings of ``run_state`` in this function.
    bundle_uuid = run_state.bundle.uuid
    bundle_path = run_state.bundle_path

    def check_and_report_finished(run_state):
        # A docker error is logged and reported as "not finished yet".
        try:
            finished, exitcode, failure_msg = docker_utils.check_finished(run_state.container)
        except docker_utils.DockerException:
            logger.error(traceback.format_exc())
            finished, exitcode, failure_msg = False, None, None
        return run_state._replace(
            finished=finished, exitcode=exitcode, failure_message=failure_msg
        )

    def check_resource_utilization(run_state: RunState):
        logger.info(f'Checking resource utilization for bundle. uuid: {run_state.bundle.uuid}')
        cpu_usage, memory_usage = docker_utils.get_container_stats_with_docker_stats(
            run_state.container
        )
        run_state = run_state._replace(cpu_usage=cpu_usage, memory_usage=memory_usage)

        run_stats = docker_utils.get_container_stats(run_state.container)
        container_time_total = docker_utils.get_container_running_time(run_state.container)
        run_state = run_state._replace(
            max_memory=max(run_state.max_memory, run_stats.get('memory', 0)),
            # Latest value published by the background disk monitor.
            disk_utilization=self.disk_utilization[run_state.bundle.uuid]['disk_utilization'],
            container_time_total=container_time_total,
            container_time_user=run_stats.get(
                'container_time_user', run_state.container_time_user
            ),
            container_time_system=run_stats.get(
                'container_time_system', run_state.container_time_system
            ),
        )

        kill_messages = []
        if run_state.resources.time and container_time_total > run_state.resources.time:
            kill_messages.append(
                'Time limit exceeded. (Container uptime %s > time limit %s)'
                % (duration_str(container_time_total), duration_str(run_state.resources.time))
            )

        # Exit code 137 is treated as a memory-limit kill as well.
        if run_state.max_memory > run_state.resources.memory or run_state.exitcode == 137:
            kill_messages.append(
                'Memory limit %s exceeded.' % size_str(run_state.resources.memory)
            )

        if run_state.resources.disk and run_state.disk_utilization > run_state.resources.disk:
            kill_messages.append(
                'Disk limit %sb exceeded.' % size_str(run_state.resources.disk)
            )

        if kill_messages:
            run_state = run_state._replace(kill_message=' '.join(kill_messages), is_killed=True)
        return run_state

    def check_disk_utilization():
        # Thread target: repeatedly measures the bundle directory and
        # publishes the size until the entry's 'running' flag turns falsy.
        logger.info(f'Checking disk utilization for bundle. uuid: {bundle_uuid}')
        running = True
        while running:
            start_time = time.time()
            try:
                disk_utilization = get_path_size(bundle_path)
                self.disk_utilization[bundle_uuid]['disk_utilization'] = disk_utilization
            except Exception:
                logger.error(traceback.format_exc())
            # Read the stop flag separately from the measurement: if the
            # size computation keeps failing, the thread must still be
            # able to see the stop request and exit.
            try:
                running = self.disk_utilization[bundle_uuid]['running']
            except Exception:
                logger.error(traceback.format_exc())
            end_time = time.time()

            # To ensure that we don't hammer the disk for this computation when
            # there are lots of files, we run it at most 10% of the time.
            time.sleep(max((end_time - start_time) * 10, 1.0))

    def stop_disk_monitor():
        # Ask the monitor loop to exit, then drop its bookkeeping entry.
        self.disk_utilization[bundle_uuid]['running'] = False
        self.disk_utilization.remove(bundle_uuid)

    # NOTE(review): presumably add_if_new registers and starts the thread
    # only when the bundle has no entry yet -- confirm against its definition.
    self.disk_utilization.add_if_new(
        bundle_uuid, threading.Thread(target=check_disk_utilization, args=[])
    )
    run_state = check_and_report_finished(run_state)
    run_state = check_resource_utilization(run_state)

    if run_state.is_killed or run_state.is_restaged:
        log_bundle_transition(
            bundle_uuid=run_state.bundle.uuid,
            previous_stage=run_state.stage,
            next_stage=RunStage.CLEANING_UP,
            reason=f'the bundle was {"killed" if run_state.is_killed else "restaged"}',
        )
        if docker_utils.container_exists(run_state.container):
            try:
                run_state.container.kill()
            except docker.errors.APIError:
                # A failed kill only matters if the container is still running.
                finished, _, _ = docker_utils.check_finished(run_state.container)
                if not finished:
                    logger.error(traceback.format_exc())
        stop_disk_monitor()
        return run_state._replace(stage=RunStage.CLEANING_UP)

    if run_state.finished:
        logger.debug(
            'Finished run with UUID %s, exitcode %s, failure_message %s',
            run_state.bundle.uuid,
            run_state.exitcode,
            run_state.failure_message,
        )
        stop_disk_monitor()
        return run_state._replace(stage=RunStage.CLEANING_UP, run_status='Uploading results')

    return run_state
def _transition_from_RUNNING(self, run_state):
    """
    1- Check run status of the docker container
    2- Refresh resource usage (container time, memory, disk) and mark the
       run as killed when a time, memory or disk limit is exceeded
    3- If run is killed, kill the container and move to CLEANING_UP state
    4- If run is finished, move to CLEANING_UP state
    5- Otherwise stay in the current stage and return the refreshed state
    """

    def check_and_report_finished(run_state):
        # A docker error is logged and reported as "not finished yet".
        try:
            finished, exitcode, failure_msg = docker_utils.check_finished(run_state.container)
        except docker_utils.DockerException:
            logger.error(traceback.format_exc())
            finished, exitcode, failure_msg = False, None, None
        return run_state._replace(
            finished=finished, exitcode=exitcode, failure_message=failure_msg
        )

    def check_disk_utilization(run_state):
        # Best effort: if measuring the bundle directory fails, the error
        # is logged and the previous disk_utilization value is kept.
        try:
            disk_utilization = get_path_size(run_state.bundle_path)
            # _replace returns a new state object instead of mutating in
            # place (every other call site here rebinds the result), so
            # the result must be kept or the measurement is lost and the
            # disk limit below is never enforced.
            run_state = run_state._replace(disk_utilization=disk_utilization)
        except Exception:
            logger.error(traceback.format_exc())
        return run_state

    def check_resource_utilization(run_state):
        kill_messages = []

        run_stats = docker_utils.get_container_stats(run_state.container)
        container_time_total = docker_utils.get_container_running_time(run_state.container)
        run_state = run_state._replace(
            container_time_total=container_time_total,
            container_time_user=run_stats.get(
                'container_time_user', run_state.container_time_user
            ),
            container_time_system=run_stats.get(
                'container_time_system', run_state.container_time_system
            ),
        )
        run_state = run_state._replace(
            max_memory=max(run_state.max_memory, run_stats.get('memory', 0))
        )
        run_state = check_disk_utilization(run_state)

        if run_state.resources.time and container_time_total > run_state.resources.time:
            kill_messages.append(
                'Time limit exceeded. (Container uptime %s > time limit %s)'
                % (duration_str(container_time_total), duration_str(run_state.resources.time))
            )

        # Exit code 137 is treated as a memory-limit kill as well.
        if run_state.max_memory > run_state.resources.memory or run_state.exitcode == 137:
            kill_messages.append(
                'Memory limit %s exceeded.' % size_str(run_state.resources.memory)
            )

        if run_state.resources.disk and run_state.disk_utilization > run_state.resources.disk:
            kill_messages.append(
                'Disk limit %sb exceeded.' % size_str(run_state.resources.disk)
            )

        if kill_messages:
            run_state = run_state._replace(kill_message=' '.join(kill_messages), is_killed=True)
        return run_state

    run_state = check_and_report_finished(run_state)
    run_state = check_resource_utilization(run_state)

    if run_state.is_killed:
        if docker_utils.container_exists(run_state.container):
            try:
                run_state.container.kill()
            except docker.errors.APIError:
                # A failed kill only matters if the container is still running.
                finished, _, _ = docker_utils.check_finished(run_state.container)
                if not finished:
                    logger.error(traceback.format_exc())
        return run_state._replace(stage=RunStage.CLEANING_UP)

    if run_state.finished:
        logger.debug(
            'Finished run with UUID %s, exitcode %s, failure_message %s',
            run_state.bundle.uuid,
            run_state.exitcode,
            run_state.failure_message,
        )
        return run_state._replace(stage=RunStage.CLEANING_UP, run_status='Uploading results')

    return run_state