def handle_restarted_experiment(experiment):
    """Seed a restarted experiment with the outputs of its original experiment.

    A persisted log line announcing the copy is published first, then the
    outputs are copied from `experiment.original_experiment` to `experiment`
    (both identified by their `unique_name`). An `OSError` raised along the
    way is not propagated: a persisted log line and a logger warning carrying
    the same message are emitted instead.
    """
    source_name = experiment.original_experiment.unique_name
    target_name = experiment.unique_name

    def announce(message):
        # Every message of this step is persisted and attached to the whole
        # experiment (`job_uuid='all'`) under the BUILDING status.
        publisher.publish_log(
            log_line=message,
            status=ExperimentLifeCycle.BUILDING,
            experiment_uuid=experiment.uuid.hex,
            experiment_name=experiment.unique_name,
            job_uuid='all',
            persist=True)

    try:
        announce(
            'Copying outputs from experiment `{}` into experiment `{}`'
            .format(source_name, target_name))
        copy_experiment_outputs(source_name, target_name)
    except OSError:
        failure = (
            'Could not copy the outputs of experiment `{}` into experiment `{}`'
            .format(source_name, target_name))
        announce(failure)
        logger.warning(failure)
def _handle_logs(self, log_line):
    """Publish `log_line` for this experiment under the BUILDING status.

    The line is attached to the experiment identified by
    `self.experiment_uuid` / `self.experiment_name`, with `job_uuid='all'`.
    """
    publish = publisher.publish_log
    fields = {
        'log_line': log_line,
        'status': ExperimentLifeCycle.BUILDING,
        'experiment_uuid': self.experiment_uuid,
        'experiment_name': self.experiment_name,
        'job_uuid': 'all',
    }
    publish(**fields)
def _handle_logs(self, log_line):
    """Publish `log_line` for this experiment under the BUILDING status.

    The line is attached to the experiment identified by
    `self.experiment_uuid` / `self.experiment_name`, with `job_uuid='all'`,
    and is explicitly published with `persist=False`.
    """
    publish = publisher.publish_log
    fields = {
        'log_line': log_line,
        'status': ExperimentLifeCycle.BUILDING,
        'experiment_uuid': self.experiment_uuid,
        'experiment_name': self.experiment_name,
        'job_uuid': 'all',
        'persist': False,  # TODO: ADD log persistence
    }
    publish(**fields)
def build(self, memory_limit=None):
    """Build the docker image for this experiment, publishing the build output.

    Checks out `self.image_tag` in `self.repo_path`, writes the rendered
    Dockerfile to `self.dockerfile_path`, connects, and runs the docker build
    tagged `image_name:image_tag`. Each streamed log line is published under
    the BUILDING status. Once more than `self.CHECK_INTERVAL` lines have been
    seen since the last check, the experiment is polled and the build is
    abandoned if it is no longer running.

    The repo is always checked back out (no explicit commit) before this
    method exits, including when the build is abandoned or an error is
    raised after the initial checkout.

    Args:
        memory_limit: optional value for the `memory` container limit of the
            build; no memory limit is set when it is falsy.

    Returns:
        True when the build stream was consumed to the end, False when the
        build was abandoned because the experiment is not running anymore.
    """
    # Checkout to the correct commit
    git.checkout_commit(repo_path=self.repo_path, commit=self.image_tag)
    try:
        limits = {
            # Always disable memory swap for building, since mostly
            # nothing good can come of that.
            # NOTE(review): Docker documents a memory-swap value of -1 as
            # *unlimited* swap, not disabled swap -- confirm the intent.
            'memswap': -1
        }
        if memory_limit:
            limits['memory'] = memory_limit

        # Create DockerFile
        with open(self.dockerfile_path, 'w') as dockerfile:
            dockerfile.write(self.render())

        self.connect()
        check_pulse = 0
        for log_line in self.docker.build(
                path=self.build_path,
                tag='{}:{}'.format(self.image_name, self.image_tag),
                buildargs={},
                decode=True,
                forcerm=True,
                rm=True,
                pull=True,
                nocache=False,
                container_limits=limits,
                stream=True,
        ):
            check_pulse += 1
            publisher.publish_log(
                log_line=log_line,
                status=ExperimentLifeCycle.BUILDING,
                experiment_uuid=self.experiment_uuid,
                experiment_name=self.experiment_name,
                job_uuid='all',
                persist=False  # TODO: ADD log persistence
            )

            # Check if experiment is not stopped in the meanwhile
            if check_pulse > self.CHECK_INTERVAL:
                if not experiment_still_running(self.experiment_uuid):
                    logger.info(
                        'Experiment `{}` is not running, stopping build'.
                        format(self.experiment_uuid))
                    return False
                check_pulse = 0

        return True
    finally:
        # Checkout back to master, on every exit path: previously an
        # abandoned or failed build left the repo on the build commit.
        git.checkout_commit(repo_path=self.repo_path)
def run(k8s_manager, pod_id, experiment_uuid, experiment_name, job_uuid,
        task_type, task_idx, container_job_name):
    """Follow a pod container's log and publish every streamed line.

    The log of container `container_job_name` in pod `pod_id` is read from
    `k8s_manager.namespace` with `follow=True` and without preloading the
    content. Each item yielded by the response stream is published under the
    RUNNING status, together with the experiment, job and task identifiers
    passed in.
    """
    # Identifiers that are the same for every published line.
    identifiers = dict(
        experiment_uuid=experiment_uuid,
        experiment_name=experiment_name,
        job_uuid=job_uuid,
        task_type=task_type,
        task_idx=task_idx,
    )

    response = k8s_manager.k8s_api.read_namespaced_pod_log(
        pod_id,
        k8s_manager.namespace,
        container=container_job_name,
        follow=True,
        _preload_content=False)

    for log_line in response.stream():
        publisher.publish_log(
            log_line=log_line,
            status=ExperimentLifeCycle.RUNNING,
            **identifiers)
def push(self):
    """Push `image_name:image_tag`, publishing the push output as it streams.

    Each chunk of the push stream is decoded as UTF-8, split on `\\r\\n` and
    parsed as JSON messages. Per-layer progress is tracked and logged at
    debug level at most every 1.5 seconds. The raw chunk is published under
    the BUILDING status. Once more than `self.CHECK_INTERVAL` chunks have
    been seen since the last check, the experiment is polled and the push is
    abandoned if it is no longer running.

    Returns:
        True when the push stream was consumed to the end. False when a
        streamed message contains an `error` key (the error is logged), or
        when the experiment is not running anymore. The error path used to
        return None; it now returns False so every exit yields a bool
        (still falsy for existing callers).
    """
    # Build a progress setup for each layer, and only emit per-layer info every 1.5s
    layers = {}
    last_emit_time = time.time()
    self.connect()
    check_pulse = 0
    for log_line in self.docker.push(self.image_name, tag=self.image_tag, stream=True):
        # A single chunk may carry several JSON messages; skip empty pieces.
        messages = [
            json.loads(piece)
            for piece in log_line.decode('utf-8').split('\r\n')
            if piece
        ]
        for progress in messages:
            if 'error' in progress:
                logger.error(progress['error'], extra=dict(phase='failed'))
                return False
            if 'id' not in progress:
                continue
            # Prefer the detailed progress when present and non-empty,
            # otherwise fall back to the textual status of the layer.
            if 'progressDetail' in progress and progress['progressDetail']:
                layers[progress['id']] = progress['progressDetail']
            else:
                layers[progress['id']] = progress['status']
            if time.time() - last_emit_time > 1.5:
                logger.debug('Pushing image\n', extra=dict(progress=layers, phase='pushing'))
                last_emit_time = time.time()

        # The undecoded chunk is what gets published, as before.
        publisher.publish_log(
            log_line=log_line,
            status=ExperimentLifeCycle.BUILDING,
            experiment_uuid=self.experiment_uuid,
            experiment_name=self.experiment_name,
            job_uuid='all',
            persist=False  # TODO: ADD log persistence
        )

        # Check if experiment is not stopped in the meanwhile
        check_pulse += 1
        if check_pulse > self.CHECK_INTERVAL:
            if not experiment_still_running(self.experiment_uuid):
                logger.info(
                    'Experiment `{}` is not running, stopping build'.
                    format(self.experiment_uuid))
                return False
            check_pulse = 0

    return True