def complete_task(self):
    """Mark one task of the current evaluation job as complete.

    If that completes the whole job, publish the evaluation summary to
    TensorBoard (when available), log it, and — unless running in
    eval-only mode — drop the job and try to schedule a new one.

    Returns:
        The evaluation-summary dict when the job just finished;
        otherwise ``None`` (implicit).
    """
    # Nothing to do when no evaluation job is active.
    if self._eval_job is None:
        return
    self._eval_job.complete_task()
    if self._eval_job.finished():
        evaluation_metrics = (
            self._eval_job.evaluation_metrics.get_evaluation_summary()
        )
        if self._tensorboard_service and evaluation_metrics:
            self._tensorboard_service.write_dict_to_summary(
                evaluation_metrics, version=self._eval_job.model_version
            )
        # A negative model version means "latest"; fall back to the
        # master's current version for the log message.
        logger.info(
            "Evaluation metrics[v=%d]: %s"
            % (
                self._eval_job.model_version
                if self._eval_job.model_version >= 0
                else self._master_servicer.get_model_version(),
                str(evaluation_metrics),
            )
        )
        if not self._eval_only:
            # delete checkpoint file
            self._eval_job = None
            # create new eval job if possible
            self.try_to_create_new_job()
        return evaluation_metrics
def _start_worker(self, worker_id):
    """Assemble the worker command line, create its pod, and record it.

    Returns True when the pod was created, False otherwise (the id is
    then queued in ``_not_created_worker_id`` for a later retry).
    """
    logger.info("Starting worker: %d" % worker_id)
    command_line = self._worker_args[1]
    command_line += " --worker_id {}".format(worker_id)
    if self._ps_addrs:
        command_line += " --ps_addrs {}".format(self._ps_addrs)
    if self._log_file_path:
        # Redirect stdout/stderr of the worker process to the log file.
        command_line += BashCommandTemplate.REDIRECTION.format(
            self._log_file_path)
    for extra in self._worker_args[2:]:
        command_line += " {}".format(extra)
    pod_args = [self._worker_args[0], command_line]
    with self._lock:
        created_pod = self._k8s_client.create_worker(
            worker_id=worker_id,
            resource_requests=self._worker_resource_request,
            resource_limits=self._worker_resource_limit,
            pod_priority=self._worker_pod_priority[worker_id],
            termination_period=1,
            volume=self._volume,
            image_pull_policy=self._image_pull_policy,
            command=self._worker_command,
            args=pod_args,
            restart_policy=self._restart_policy,
            ps_addrs=self._ps_addrs,
            envs=copy.deepcopy(self._envs),
        )
        if not created_pod:
            # Remember the failure so the worker can be created later.
            self._not_created_worker_id.append(worker_id)
            return False
        pod_name = created_pod.metadata.name
        self._worker_pod_name_to_id[pod_name] = worker_id
        self._worker_pods_ip_phase[worker_id] = (pod_name, None, None)
        return True
def monitor_status(self):
    """Poll the master pod and report job progress until it terminates.

    Logs the master's phase, tails its log on failure, and while the job
    is running forwards new evaluation/task log lines and checks the
    worker and PS pods.

    Returns:
        bool: True when the master pod reached the SUCCEEDED phase.
    """
    retry_num = 0
    job_succeed = False
    master_old_log = ""
    while True:
        master_pod = self.client.get_master_pod()
        if master_pod is None:
            retry_num += 1
            if retry_num > MAX_READ_POD_RETRIES:
                logger.error("{} Not Found".format(
                    self.client.get_master_pod_name()))
                break
            time.sleep(10)
            continue
        # Fix: count only *consecutive* read failures. Previously the
        # counter was never reset after a successful read, so sporadic
        # transient failures over a long job could eventually abort the
        # monitor. This matches the pod monitor elsewhere in this file.
        retry_num = 0
        logger.info("Master status: {}".format(master_pod.status.phase))
        if master_pod.status.phase == PodStatus.SUCCEEDED:
            job_succeed = True
            break
        elif master_pod.status.phase == PodStatus.PENDING:
            time.sleep(10)
        elif master_pod.status.phase == PodStatus.FAILED:
            log = self.client.get_master_log()
            print_tail_log(log, tail_num=100)
            logger.error("Job {} Failed".format(self.job_name))
            break
        else:
            # Running: surface only the log lines added since last poll.
            master_new_log = self.client.get_master_log()
            self.show_evaluation_and_task_log(master_new_log, master_old_log)
            master_old_log = master_new_log
            self.check_worker_status()
            self.check_ps_status()
            time.sleep(60)
    return job_succeed
def _process_eval_task(self, task):
    """
    Check if there are evaluation tasks and process the tasks if any.

    Return:
        A python bool indicating whether worker processed some evaluation
        tasks.
    """
    logger.info("the evaluation task_id: %d" % task.task_id)
    eval_info = self._task_data_service.get_validation_dataset(task)
    if not eval_info:
        # NOTE(review): this returns None, not the bool the docstring
        # promises (as does the fall-through below) — confirm callers
        # don't rely on the return value.
        return
    (eval_dataset, model_version, task_id) = eval_info
    # Apply the user dataset_fn in evaluation mode, then batch/prefetch.
    eval_dataset = self._dataset_fn(
        eval_dataset,
        Mode.EVALUATION,
        self._task_data_service.data_reader.metadata,
    )
    eval_dataset = eval_dataset.batch(self._minibatch_size).prefetch(1)
    err_msg = ""
    for dataset_batch in eval_dataset:
        data_err_msg = self._process_minibatch_and_report(
            dataset_batch, elasticdl_pb2.EVALUATION, model_version)
        if data_err_msg:
            # Stop at the first minibatch error and report it below.
            err_msg = data_err_msg
            break
    # Release the dataset before reporting to free its resources.
    del eval_dataset
    self.report_evaluation_metrics(
        model_outputs=self._evaluation_result[MetricsDictKey.MODEL_OUTPUT],
        labels=self._evaluation_result[MetricsDictKey.LABEL],
    )
    self.report_task_result(task_id, err_msg)
    # Reset accumulated results for the next evaluation task.
    self._evaluation_result = {}
def monitor_status(self):
    """Poll ``self.pod_name`` until it reaches a terminal phase.

    Returns:
        bool: True when the pod finished in the SUCCEEDED phase.
    """
    consecutive_misses = 0
    pod_succeeded = False
    while True:
        try:
            pod = self.client.get_pod(self.pod_name)
            if pod is None:
                consecutive_misses += 1
                if consecutive_misses > MAX_READ_POD_RETRIES:
                    logger.error("{} Not Found".format(self.pod_name))
                    break
                time.sleep(10)
                continue
            # A successful read resets the miss counter.
            consecutive_misses = 0
            phase = pod.status.phase
            logger.info("Pod Status : %s" % phase)
            if phase == PodStatus.SUCCEEDED:
                pod_succeeded = True
                break
            if phase == PodStatus.FAILED:
                # Dump the pod log so the failure cause is visible.
                logger.info(self.client.get_pod_log(self.pod_name))
                break
            time.sleep(30)
        except client.api_client.ApiException:
            # Transient API-server error: back off and retry.
            time.sleep(60)
    return pod_succeeded
def _gen(self):
    """
    A generator supports the iter() protocol (e.g. a generator function),
    used to create a `tf.data.Dataset` object from a list of tasks.
    """
    while True:
        # Make sure we also generate data from the warm-up task.
        if self._warm_up_task is not None and self._has_warmed_up:
            task = self._warm_up_task
            self._warm_up_task = None
        else:
            task = self._worker.get_task()
        # An empty shard name marks a control message, not a data task;
        # either way the current dataset ends here.
        if not task.shard_name:
            if task.type == elasticdl_pb2.WAIT:
                self._pending_dataset = True
                logger.info("No tasks for now, maybe more later")
                # There are too many requests to get task from the master
                # if the worker does not sleep.
                time.sleep(5)
            else:
                logger.info("No more task, stopping")
            break
        with self._lock:
            if task.type == elasticdl_pb2.TRAIN_END_CALLBACK:
                # Defer the train-end callback; it is not a data task.
                self._pending_train_end_callback_task = task
                continue
            self._pending_tasks.append(task)
            if len(self._pending_tasks) == 1:
                self._current_task = task
        for data in self.data_reader.read_records(task):
            if data:
                yield data
def _replace_attr_with_keras_embedding(model):
    """Replace the elasticdl.layers.Embedding attributes in the model
    with `tf.keras.layers.Embedding` or `SparseEmbedding` layers.

    Args:
        model: a model instance whose ``__dict__`` attributes are scanned
            for elasticdl ``Embedding`` layers.

    Returns:
        The same model instance with the attributes replaced in place.
    """
    for name, value in model.__dict__.items():
        if type(value) == Embedding:
            # The combiner is not None only for SparseEmbedding,
            if value.combiner is not None:
                logger.info("Replace elasticdl with SparseEmbedding")
                embedding_layer = SparseEmbedding(
                    output_dim=value.output_dim,
                    input_dim=value.input_dim,
                    embeddings_initializer=value.embeddings_initializer,
                    combiner=value.combiner,
                )
            else:
                # Fix: the message was passed as two positional args to
                # logger.info, so the second string was treated as a
                # %-format argument with no placeholder and never shown;
                # it also misspelled "tf.keras".
                logger.info(
                    "Replace elasticdl with tf.keras.layers.Embedding")
                embedding_layer = tf.keras.layers.Embedding(
                    output_dim=value.output_dim,
                    input_dim=value.input_dim,
                    embeddings_initializer=value.embeddings_initializer,
                    mask_zero=value.mask_zero,
                    input_length=value.input_length,
                )
            setattr(model, name, embedding_layer)
    return model
def _event_cb(self, event):
    """Handle a k8s watch event for a tracked pod: record its phase and,
    when a live worker pod is deleted, relaunch a replacement.
    """
    evt_obj = event.get("object")
    evt_type = event.get("type")
    if not evt_obj or not evt_type:
        logger.error("Event doesn't have object or type: %s" % event)
        return
    pod_name = evt_obj.metadata.name
    phase = evt_obj.status.phase
    logger.info("Got event %s, phase %s for pod: %s"
                % (evt_type, phase, pod_name))
    relaunch = False
    with self._lock:
        worker_id = self._pod_name_to_id.get(pod_name)
        # Ignore pods we don't know about, except the master pod.
        if (worker_id is None
                and pod_name != self._k8s_client.get_master_pod_name()):
            logger.error("Unknown worker pod name: %s" % pod_name)
            return
        # NOTE(review): for the master pod, worker_id is None here, so
        # its phase is stored under the key None — confirm this is
        # intentional rather than an oversight.
        self._pods_phase[worker_id] = (pod_name, phase)
        if evt_type == "DELETED":
            del self._pods_phase[worker_id]
            del self._pod_name_to_id[pod_name]
            # Give the deleted worker's unfinished tasks back to the pool.
            self._task_d.recover_tasks(worker_id)
            # If a deleted pod was not "Succeeded", relaunch a worker.
            relaunch = (self._relaunch_deleted_live_worker
                        and phase != "Succeeded")
    # Relaunch outside the lock: _start_worker acquires it again.
    if relaunch:
        logger.info("Relaunching worker.")
        self._start_worker(self._next_worker_id())
def create_tasks(self, task_type, model_version=-1):
    """Create one ``_Task`` per ``records_per_task`` slice of every shard
    of the given type and append them to the todo queue.

    Training tasks are shuffled before being enqueued; other task types
    are enqueued under the lock.

    Args:
        task_type: elasticdl_pb2 task type (TRAINING/EVALUATION/...).
        model_version: model version to stamp on each task; -1 means
            "latest".

    Returns:
        The list of tasks created.
    """
    logger.info(
        "Creating a new set of %s tasks for model version %d",
        elasticdl_pb2._TASKTYPE.values_by_number[task_type].name.lower(),
        model_version,
    )
    if task_type == elasticdl_pb2.TRAINING:
        shards = self._training_shards
    elif task_type == elasticdl_pb2.EVALUATION:
        shards = self._evaluation_shards
    else:
        shards = self._prediction_shards
    tasks = []
    # Note that a shard may contain records for multiple tasks.
    for (
        shard_name,
        (start_ind_this_shard, num_records_this_shard),
    ) in shards.items():
        max_ind_this_shard = start_ind_this_shard + num_records_this_shard
        for start_ind_this_task in range(
            start_ind_this_shard,
            start_ind_this_shard + num_records_this_shard,
            self._records_per_task,
        ):
            max_ind_this_task = (
                start_ind_this_task + self._records_per_task
            )
            # NOTE(review): clamping to num_records_this_shard (a count,
            # not an absolute index) looks suspicious whenever
            # start_ind_this_shard > 0; the fix-up branch below appears
            # to compensate — confirm against the shard index layout.
            end_ind_this_task = min(
                max_ind_this_task, num_records_this_shard
            )
            # If the start index is not smaller than end index,
            # we need to find the correct end index by taking the start
            # index into account. We should not create task with
            # end index that exceeds the maximally possible number of
            # records available in this shard.
            if start_ind_this_task >= end_ind_this_task:
                end_ind_this_task = min(
                    max_ind_this_task,
                    start_ind_this_task + num_records_this_shard,
                    max_ind_this_shard,
                )
            # Note that only records in [start, end) of this task
            # will be consumed later in the worker that handles
            # this task.
            tasks.append(
                _Task(
                    shard_name=shard_name,
                    start=start_ind_this_task,
                    end=end_ind_this_task,
                    type=task_type,
                    model_version=model_version,
                )
            )
    if task_type == elasticdl_pb2.TRAINING:
        random.shuffle(tasks)
        self._todo.extend(tasks)
    else:
        with self._lock:
            self._todo.extend(tasks)
    return tasks
def prepare(self): self.validate() # Composite the components if self.task_manager and self.pod_manager: self.task_manager.set_task_timeout_callback( self.pod_manager._remove_worker) if self.pod_manager: self._set_command_in_pod_manager() # Add PodEventCallbacks for the listeners of Pod events. if self.task_manager: self.pod_manager.add_pod_event_callback( TaskRescheduleCallback(self.task_manager)) if self.rendezvous_server: self.pod_manager.add_pod_event_callback( RendezvousServiceRefreshCallback(self.rendezvous_server)) if self._is_tfv1_ps_strategy_custom_training(): self.pod_manager.add_pod_event_callback( TFV1PSStrategyTrainLoopMonitorCallback(self)) # Start the components one by one if self.task_manager: self.task_manager.start() if self.rendezvous_server: self.rendezvous_server.start() if self.pod_manager: self.pod_manager.start() if self.elasticdl_job_service: self.elasticdl_job_service.start() # Start the master GRPC server logger.info("Starting master RPC server") self._master_server.start() logger.info("Master RPC server started")
def _start_ps(self, ps_id):
    """Assemble the PS command line, create its pod, and create the
    matching k8s service.

    Args:
        ps_id: integer id of the parameter server to launch.

    Returns:
        bool: True when the pod was created, False otherwise.
    """
    logger.info("Starting PS: %d" % ps_id)
    bash_command = self._ps_args[1]
    bash_command += " --ps_id {}".format(ps_id)
    if self._log_file_path:
        # Redirect stdout/stderr of the PS process to the log file.
        bash_command += BashCommandTemplate.REDIRECTION.format(
            self._log_file_path)
    ps_args = [self._ps_args[0], bash_command]
    with self._lock:
        pod = self._k8s_client.create_ps(
            ps_id=ps_id,
            resource_requests=self._ps_resource_request,
            resource_limits=self._ps_resource_limit,
            pod_priority=self._ps_pod_priority,
            volume=self._volume,
            image_pull_policy=self._image_pull_policy,
            command=self._ps_command,
            args=ps_args,
            restart_policy=self._restart_policy,
            envs=copy.deepcopy(self._envs),
            expose_ports=False,
        )
        # Fix: guard against a failed pod creation, mirroring
        # _start_worker. Previously a falsy return from create_ps made
        # `pod.metadata` raise AttributeError while holding the lock.
        if not pod:
            logger.error("Failed to create PS pod: %d" % ps_id)
            return False
        name = pod.metadata.name
        self._ps_pod_name_to_id[name] = ps_id
        self._ps_pods_phase[ps_id] = (name, None)
        self._k8s_client.create_ps_service(ps_id)
    return True
def _gen(self):
    """
    A generator supports the iter() protocol (e.g. a generator function),
    used to create a `tf.data.Dataset` from a list of tasks.
    """
    while True:
        task = self._worker.get_task()
        # An empty shard name marks a control message; the current
        # dataset ends either way.
        if not task.shard_name:
            if task.type == elasticdl_pb2.WAIT:
                self._pending_dataset = True
                logger.info(
                    "Finish current dataset, maybe more data later")
            else:
                logger.info("No more task, stopping")
            break
        with self._lock:
            if (self._training_with_evaluation
                    and task.type == elasticdl_pb2.EVALUATION):
                # Evaluation tasks are queued separately, not yielded
                # through the training dataset.
                self._pending_eval_tasks.append(task)
                continue
            # Track the cumulative record count per pending task.
            self._record_count += task.end - task.start
            self._pending_tasks_with_counts.append(
                (task, self._record_count))
            if len(self._pending_tasks_with_counts) == 1:
                self._current_task = task
        for data in self._data_reader.read_records(task):
            if data:
                yield data
def report(self, task_id, success):
    """Report if the task is successful or not"""
    finished_eval_task = False
    with self._lock:
        _, task = self._doing.pop(task_id, (-1, None))
        if not task:
            logger.warning("Unknown task_id: %d" % task_id)
        elif not success:
            # TODO: keep count of retries.
            # Failed tasks go back to the matching queue for a retry.
            retry_queue = (
                self._todo
                if task.type == elasticdl_pb2.TRAINING
                else self._eval_todo
            )
            retry_queue.append(task)
        elif (task.type == elasticdl_pb2.EVALUATION
                and self._evaluation_service is not None):
            finished_eval_task = True
        else:
            remaining = len(self._todo) + len(self._doing)
            logger.info(
                "Task:%d completed, %d remaining tasks",
                task_id,
                remaining,
            )
    # Notify the evaluation service outside the lock.
    if finished_eval_task:
        self._evaluation_service.complete_task()
def _clone_function(layer):
    """Map a layer to its replacement for keras model cloning: elasticdl
    Embedding layers become keras/Sparse embeddings, DenseFeatures get
    their embedding columns swapped; everything else passes through.
    """
    if type(layer) == tf.keras.layers.DenseFeatures:
        return _replace_edl_embedding_column_with_tf(layer)
    if type(layer) != Embedding:
        return layer
    logger.info("Replace embedding layer with "
                "elasticdl.layers.Embedding")
    # The combiner is not None only for SparseEmbedding,
    if layer.combiner is None:
        return tf.keras.layers.Embedding(
            output_dim=layer.output_dim,
            input_dim=layer.input_dim,
            embeddings_initializer=layer.embeddings_initializer,
            mask_zero=layer.mask_zero,
            input_length=layer.input_length,
            name=layer.name,
        )
    return SparseEmbedding(
        output_dim=layer.output_dim,
        input_dim=layer.input_dim,
        embeddings_initializer=layer.embeddings_initializer,
        name=layer.name,
        combiner=layer.combiner,
    )
def run(self):
    """Execute the training loop: train for ``self.epoch`` epochs,
    evaluating every ``evaluation_steps`` steps and at each epoch end,
    then export the model when a save directory is configured.
    """
    epoch = 0
    global_step = 0
    train_tasks = self._gen_tasks(self.training_data)
    validation_tasks = self._gen_tasks(self.validation_data)
    train_dataset = self._get_dataset(train_tasks)
    validation_dataset = self._get_dataset(validation_tasks)
    while epoch < self.epoch:
        for features, labels in train_dataset:
            loss = self._train(features, labels)
            logger.info("step {}, Loss = {}".format(global_step, loss))
            global_step += 1
            periodic_eval_due = (
                self.evaluation_steps > 0
                and global_step % self.evaluation_steps == 0
            )
            if periodic_eval_due:
                self._evaluate(validation_dataset)
        # Always evaluate once more at the end of the epoch.
        self._evaluate(validation_dataset)
        logger.info("Epoch {} end".format(epoch))
        epoch += 1
    if self.save_model_dir != "":
        tf.saved_model.save(self.model_inst, self.save_model_dir)
def report(self, request, success):
    """Report if the task is successful or not"""
    task_id = request.task_id
    finished_eval_task = False
    with self._lock:
        _, task = self._doing.pop(task_id, (-1, None))
        if task:
            # Accumulate the failed-record count reported by the worker.
            failed = request.exec_counters.get(
                TaskExecCounterKey.FAIL_COUNT, 0)
            self._job_counters[task.type].failed_records += failed
        if not task:
            logger.warning("Unknown task_id: %d" % task_id)
        elif not success:
            # TODO: keep count of retries.
            # Failed tasks go back to the matching queue for a retry.
            retry_queue = (
                self._todo
                if task.type == elasticdl_pb2.TRAINING
                else self._eval_todo
            )
            retry_queue.append(task)
        elif (task.type == elasticdl_pb2.EVALUATION
                and self._evaluation_service is not None):
            finished_eval_task = True
        else:
            logger.info(
                "Task:%d completed, %d remaining tasks",
                task_id,
                len(self._todo) + len(self._doing),
            )
    # Notify the evaluation service outside the lock.
    if finished_eval_task:
        self._evaluation_service.complete_task()
def process(self, predictions, worker_id):
    """Emit a batch of predictions: write them to ODPS when a writer is
    configured, otherwise log them.
    """
    writer = self.odps_writer
    if not writer:
        logger.info(predictions.numpy())
        return
    writer.from_iterator(
        iter(predictions.numpy().tolist()), worker_id
    )
def _process_save_model_task_if_needed(self):
    """Export the model as a SavedModel when a save-model task is
    pending, then report the task as done.
    """
    task, dataset = (
        self._task_data_service.get_save_model_task_and_dataset()
    )
    if task is None or dataset is None:
        return
    # Prepare the dataset the model handler needs for export.
    dataset = self._dataset_fn(
        dataset,
        Mode.PREDICTION,
        self._task_data_service.data_reader.metadata,
    ).batch(self._minibatch_size)
    base_path = task.extended_config.get(
        SaveModelConfig.SAVED_MODEL_PATH
    )
    # Use a timestamped subdirectory so exports never collide.
    export_path = os.path.join(base_path, str(int(time.time())))
    logger.info(
        "The path to export model is {}".format(export_path)
    )
    exportable = self._model_handler.get_model_to_export(
        self._model, dataset
    )
    tf.saved_model.save(exportable, export_path)
    self.report_task_result(task_id=task.task_id, err_msg="")
def create_master(self, **kargs):
    """Build and submit the master pod, labelled as replica "master"/0.

    The pod always receives MY_POD_IP from the downward API plus any
    extra environment variables passed via ``kargs["envs"]``.
    """
    env_vars = [
        V1EnvVar(
            name="MY_POD_IP",
            value_from=V1EnvVarSource(field_ref=V1ObjectFieldSelector(
                field_path="status.podIP")),
        )
    ]
    if "envs" in kargs:
        extra_envs = kargs["envs"]
        for key in extra_envs:
            env_vars.append(V1EnvVar(name=key, value=extra_envs[key]))
    pod = self._create_pod(
        pod_name=self.get_master_pod_name(),
        job_name=self.job_name,
        image_name=self._image_name,
        command=["python"],
        resource_requests=kargs["resource_requests"],
        resource_limits=kargs["resource_limits"],
        container_args=kargs["args"],
        pod_priority=kargs["pod_priority"],
        image_pull_policy=kargs["image_pull_policy"],
        restart_policy=kargs["restart_policy"],
        volume=kargs["volume"],
        owner_pod=None,
        env=env_vars,
    )
    # Add replica type and index
    pod.metadata.labels[ELASTICDL_REPLICA_TYPE_KEY] = "master"
    pod.metadata.labels[ELASTICDL_REPLICA_INDEX_KEY] = "0"
    self.client.create_namespaced_pod(self.namespace, pod)
    logger.info("Master launched.")
def _gen(self):
    """
    A generator supports the iter() protocol (e.g. a generator function),
    used to create a `tf.data.Dataset` object from a list of tasks.
    """
    while True:
        # Make sure we also generate data from the warm-up task.
        if self._warm_up_task is not None and self._has_warmed_up:
            task = self._warm_up_task
            self._warm_up_task = None
        else:
            task = self._worker.get_task()
        # An empty shard name marks a control message; the current
        # dataset ends either way.
        if not task.shard_name:
            if task.type == elasticdl_pb2.WAIT:
                self._pending_dataset = True
                logger.info(
                    "Finish current dataset, maybe more data later"
                )
            else:
                logger.info("No more task, stopping")
            break
        with self._lock:
            if task.type == elasticdl_pb2.SAVE_MODEL:
                # Defer the save-model task; it is not a data task.
                self._pending_save_model_task = task
                continue
            self._pending_tasks.append(task)
            if len(self._pending_tasks) == 1:
                self._current_task = task
        for data in self.data_reader.read_records(task):
            if data:
                yield data
def delete_master(self):
    """Delete the master pod immediately (zero grace period)."""
    logger.info("pod name is %s" % self.get_master_pod_name())
    delete_options = client.V1DeleteOptions(grace_period_seconds=0)
    self.client.delete_namespaced_pod(
        self.get_master_pod_name(),
        self.namespace,
        body=delete_options,
    )
def _start_worker(self, worker_id):
    """Assemble the worker command line, create its pod, register it,
    and create the matching k8s service.
    """
    logger.info("Starting worker: %d" % worker_id)
    bash_command = self._worker_args[1]
    bash_command += " --worker_id {}".format(worker_id)
    bash_command += " --ps_addrs {}".format(self._ps_addrs)
    if self._log_file_path:
        # Redirect stdout/stderr of the worker process to the log file.
        bash_command += BashCommandTemplate.REDIRECTION.format(
            self._log_file_path)
    worker_args = [self._worker_args[0], bash_command]
    with self._lock:
        pod = self._k8s_client.create_worker(
            worker_id=worker_id,
            resource_requests=self._worker_resource_request,
            resource_limits=self._worker_resource_limit,
            pod_priority=self._worker_pod_priority[worker_id],
            termination_period=1,
            volume=self._volume,
            image_pull_policy=self._image_pull_policy,
            command=self._worker_command,
            args=worker_args,
            restart_policy=self._restart_policy,
            ps_addrs=self._ps_addrs,
            envs=copy.deepcopy(self._envs),
            expose_ports=self._expose_ports,
        )
        # NOTE(review): no None-check on `pod` — if create_worker can
        # fail and return a falsy value, pod.metadata below raises
        # AttributeError. Confirm create_worker's contract.
        name = pod.metadata.name
        self._worker_pod_name_to_id[name] = worker_id
        self._worker_pods_phase[worker_id] = (name, None)
        self._k8s_client.create_worker_service(worker_id)
def _replace_attr_with_edl_embedding(model):
    """Replace the keras embedding attributes in the model with
    `elasticdl.layers.Embedding` layers.

    Args:
        model: a model instance whose ``__dict__`` attributes are scanned
            for ``tf.keras.layers.Embedding`` and ``SparseEmbedding``.

    Returns:
        The same model instance with the attributes replaced in place.
    """
    for name, value in model.__dict__.items():
        if type(value) == tf.keras.layers.Embedding:
            logger.info("Replace {} layer with "
                        "elasticdl.layers.Embedding".format(value))
            initializer_name = tf.keras.initializers.serialize(
                value.embeddings_initializer)["class_name"]
            embedding_layer = Embedding(
                output_dim=value.output_dim,
                input_dim=value.input_dim,
                embeddings_initializer=initializer_name,
                mask_zero=value.mask_zero,
                input_length=value.input_length,
            )
            setattr(model, name, embedding_layer)
        elif type(value) == SparseEmbedding:
            logger.info("Replace {} layer with "
                        "elasticdl.layers.Embedding".format(value))
            # Fix: initializer_name was referenced here but only defined
            # in the tf.keras branch above, raising NameError (or reusing
            # a stale value from a previously-visited attribute).
            # Serialize this layer's own initializer instead.
            initializer_name = tf.keras.initializers.serialize(
                value.embeddings_initializer)["class_name"]
            embedding_layer = Embedding(
                output_dim=value.output_dim,
                input_dim=value.input_dim,
                embeddings_initializer=initializer_name,
                combiner=value.combiner,
            )
            setattr(model, name, embedding_layer)
    return model
def start_redis_service(self):
    """Launch the configured number of clustered redis-server instances
    on consecutive ports and log any instances that failed to start.
    """
    args = self._parse_embedding_service_args()
    logger.info("Starting redis server on ports: %d - %d, "
                "--cluster_node_timeout %d" % (
                    args.first_port,
                    args.first_port + args.num_of_redis_instances - 1,
                    args.cluster_node_timeout,
                ))
    failed_port = []
    first = args.first_port
    for port in range(first, first + args.num_of_redis_instances):
        command = (
            "redis-server --port %d --cluster-enabled yes "
            "--cluster-config-file nodes-%d.conf --cluster-node-timeout"
            " %d --appendonly yes --appendfilename appendonly-%d.aof "
            "--dbfilename dump-%d.rdb --logfile %d.log --daemonize yes "
            "--protected-mode no"
            % (port, port, args.cluster_node_timeout, port, port, port))
        if self._run_shell_command(command):
            # A non-zero exit code means this instance did not start.
            failed_port.append(port)
    if failed_port:
        local_ip = os.getenv("MY_POD_IP", "localhost")
        logger.info("%s starts these redis instances failed: %s"
                    % (local_ip, ";".join(map(str, failed_port))))
def create_master_service(
    port,
    task_manager,
    pod_manager,
    rendezvous_server,
    evaluation_service,
):
    """Create GRPC server"""
    logger.info("Creating master service")
    grpc_options = [
        ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
        (
            "grpc.max_receive_message_length",
            GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
        ),
    ]
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=grpc_options,
    )
    servicer = MasterServicer(
        task_manager=task_manager,
        instance_manager=pod_manager,
        rendezvous_server=rendezvous_server,
        evaluation_service=evaluation_service,
    )
    # One servicer instance backs both service definitions.
    elasticai_api_pb2_grpc.add_MasterServicer_to_server(
        servicer, server)
    elasticdl_pb2_grpc.add_TrainLoopMasterServicer_to_server(
        servicer, server)
    server.add_insecure_port("[::]:{}".format(port))
    logger.info("The port of the master server is: %d", port)
    return server
def print_args(args, groups=None):
    """Log every parsed option, grouped options first.

    Args:
        args: parsing results returned from `parser.parse_args`
        groups: It is a list of a list. It controls which options should
            be printed together. For example, we expect all model
            specifications such as `optimizer`, `loss` are better
            printed together.
            groups = [["optimizer", "loss"]]
    """
    def _lookup(instance, attribute):
        # Missing attributes are logged as None rather than raising.
        return getattr(instance, attribute, None)

    already_printed = set()
    if groups:
        for group in groups:
            for option in group:
                already_printed.add(option)
                logger.info("%s = %s", option, _lookup(args, option))
    # Then everything not covered by a group, in namespace order.
    for key, value in args.__dict__.items():
        if key not in already_printed:
            logger.info("%s = %s", key, value)
def request_stop(self, success, msg=""):
    """Record a stop request, setting the exit code and log level by
    whether the run succeeded.
    """
    self._stop_requested = True
    self._exit_code = 0 if success else 1
    log = logger.info if success else logger.error
    log(msg)
def _remove_ps(self, ps_id):
    """Delete the pod of PS ``ps_id`` if it is currently tracked."""
    logger.info("Removing PS: %d", ps_id)
    with self._lock:
        known = ps_id in self._ps_pods_phase
        if not known:
            logger.error("Unknown PS id: %s" % ps_id)
            return
    self._k8s_client.delete_ps(ps_id)
def start_tensorboard_service(self):
    """Create the TensorBoard k8s service and report its URL."""
    self._k8s_client.create_tensorboard_service()
    logger.info("Waiting for the URL for TensorBoard service...")
    url = self._get_tensorboard_url()
    if not url:
        logger.warning("Unable to get the URL for TensorBoard service")
        return
    logger.info("TensorBoard service is available at: %s" % url)
def _remove_worker(self, worker_id):
    """Delete the pod backing ``worker_id`` if it is currently tracked."""
    logger.info("Removing worker: %d", worker_id)
    with self._lock:
        known = worker_id in self._worker_pods_phase
        if not known:
            logger.error("Unknown worker id: %s" % worker_id)
            return
    # TODO: change _k8s_client to accept pod name instead of worker id.
    self._k8s_client.delete_worker(worker_id)