Example #1
 def complete_task(self):
     if self._eval_job is None:
         return
     self._eval_job.complete_task()
     if self._eval_job.finished():
         evaluation_metrics = (
             self._eval_job.evaluation_metrics.get_evaluation_summary()
         )
         if self._tensorboard_service and evaluation_metrics:
             self._tensorboard_service.write_dict_to_summary(
                 evaluation_metrics, version=self._eval_job.model_version
             )
         logger.info(
             "Evaluation metrics[v=%d]: %s"
             % (
                 self._eval_job.model_version
                 if self._eval_job.model_version >= 0
                 else self._master_servicer.get_model_version(),
                 str(evaluation_metrics),
             )
         )
         if not self._eval_only:
             # delete checkpoint file
             self._eval_job = None
             # create new eval job if possible
             self.try_to_create_new_job()
         return evaluation_metrics
Example #2
 def _start_worker(self, worker_id):
     logger.info("Starting worker: %d" % worker_id)
     bash_command = self._worker_args[1]
     bash_command += " --worker_id {}".format(worker_id)
     if self._ps_addrs:
         bash_command += " --ps_addrs {}".format(self._ps_addrs)
     if self._log_file_path:
         bash_command += BashCommandTemplate.REDIRECTION.format(
             self._log_file_path)
     for extra_arg in self._worker_args[2:]:
         bash_command += " {}".format(extra_arg)
     worker_args = [self._worker_args[0], bash_command]
     with self._lock:
         pod = self._k8s_client.create_worker(
             worker_id=worker_id,
             resource_requests=self._worker_resource_request,
             resource_limits=self._worker_resource_limit,
             pod_priority=self._worker_pod_priority[worker_id],
             termination_period=1,
             volume=self._volume,
             image_pull_policy=self._image_pull_policy,
             command=self._worker_command,
             args=worker_args,
             restart_policy=self._restart_policy,
             ps_addrs=self._ps_addrs,
             envs=copy.deepcopy(self._envs),
         )
         if pod:
             name = pod.metadata.name
             self._worker_pod_name_to_id[name] = worker_id
             self._worker_pods_ip_phase[worker_id] = (name, None, None)
             return True
         else:
             self._not_created_worker_id.append(worker_id)
             return False
Example #3
    def monitor_status(self):
        retry_num = 0
        job_succeed = False
        master_old_log = ""
        while True:
            master_pod = self.client.get_master_pod()
            if master_pod is None:
                retry_num += 1
                if retry_num > MAX_READ_POD_RETRIES:
                    logger.error("{} Not Found".format(
                        self.client.get_master_pod_name()))
                    break
                time.sleep(10)
                continue

            logger.info("Master status: {}".format(master_pod.status.phase))
            if master_pod.status.phase == PodStatus.SUCCEEDED:
                job_succeed = True
                break
            elif master_pod.status.phase == PodStatus.PENDING:
                time.sleep(10)
            elif master_pod.status.phase == PodStatus.FAILED:
                log = self.client.get_master_log()
                print_tail_log(log, tail_num=100)
                logger.error("Job {} Failed".format(self.job_name))
                break
            else:
                master_new_log = self.client.get_master_log()
                self.show_evaluation_and_task_log(master_new_log,
                                                  master_old_log)
                master_old_log = master_new_log
                self.check_worker_status()
                self.check_ps_status()
                time.sleep(60)
        return job_succeed
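The loop above leans on names defined elsewhere in the project. A minimal set of stand-ins for reading the snippet in isolation (the retry count is an assumption; the phase strings are the standard Kubernetes pod phases):

    import time

    MAX_READ_POD_RETRIES = 3  # assumed value; defined elsewhere in the project

    class PodStatus:
        # Standard Kubernetes pod phase strings.
        SUCCEEDED = "Succeeded"
        PENDING = "Pending"
        FAILED = "Failed"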
Example #4
 def _process_eval_task(self, task):
     """
     Check if there are evaluation tasks and process the tasks if any.
     Return:
         A Python bool indicating whether the worker processed any
         evaluation tasks.
     """
     logger.info("the evaluation task_id: %d" % task.task_id)
     eval_info = self._task_data_service.get_validation_dataset(task)
     if not eval_info:
         return
     (eval_dataset, model_version, task_id) = eval_info
     eval_dataset = self._dataset_fn(
         eval_dataset,
         Mode.EVALUATION,
         self._task_data_service.data_reader.metadata,
     )
     eval_dataset = eval_dataset.batch(self._minibatch_size).prefetch(1)
     err_msg = ""
     for dataset_batch in eval_dataset:
         data_err_msg = self._process_minibatch_and_report(
             dataset_batch, elasticdl_pb2.EVALUATION, model_version)
         if data_err_msg:
             err_msg = data_err_msg
             break
     del eval_dataset
     self.report_evaluation_metrics(
         model_outputs=self._evaluation_result[MetricsDictKey.MODEL_OUTPUT],
         labels=self._evaluation_result[MetricsDictKey.LABEL],
     )
     self.report_task_result(task_id, err_msg)
     self._evaluation_result = {}
Example #5
    def monitor_status(self):
        retry_num = 0
        pod_succeeded = False

        while True:
            try:
                pod = self.client.get_pod(self.pod_name)
                if pod is None:
                    retry_num += 1
                    if retry_num > MAX_READ_POD_RETRIES:
                        logger.error("{} Not Found".format(self.pod_name))
                        break
                    time.sleep(10)
                    continue

                retry_num = 0

                logger.info("Pod Status : %s" % pod.status.phase)
                if pod.status.phase == PodStatus.SUCCEEDED:
                    pod_succeeded = True
                    break
                elif pod.status.phase == PodStatus.FAILED:
                    logger.info(self.client.get_pod_log(self.pod_name))
                    break
                else:
                    time.sleep(30)
            except client.api_client.ApiException:
                time.sleep(60)
        return pod_succeeded
Example #6
    def _gen(self):
        """
        A generator that supports the iter() protocol (e.g. a generator
        function), used to create a `tf.data.Dataset` object from a list
        of tasks.
        """
        while True:
            # Make sure we also generate data from the warm-up task.
            if self._warm_up_task is not None and self._has_warmed_up:
                task = self._warm_up_task
                self._warm_up_task = None
            else:
                task = self._worker.get_task()
            if not task.shard_name:
                if task.type == elasticdl_pb2.WAIT:
                    self._pending_dataset = True
                    logger.info("No tasks for now, maybe more later")
                    # There are too many requests to get task from the master
                    # if the worker does not sleep.
                    time.sleep(5)
                else:
                    logger.info("No more task, stopping")
                break
            with self._lock:
                if task.type == elasticdl_pb2.TRAIN_END_CALLBACK:
                    self._pending_train_end_callback_task = task
                    continue

                self._pending_tasks.append(task)
                if len(self._pending_tasks) == 1:
                    self._current_task = task
            for data in self.data_reader.read_records(task):
                if data:
                    yield data
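As its docstring says, `_gen` only becomes useful once wrapped into a dataset. A minimal sketch of that wrapping (the helper name `make_dataset` and the `tf.string` element type are assumptions, not taken from the snippet):

    import tensorflow as tf

    # gen is a callable like self._gen above; each yielded element is
    # treated here as one serialized record.
    def make_dataset(gen):
        return tf.data.Dataset.from_generator(gen, output_types=tf.string)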
Example #7
 def _replace_attr_with_keras_embedding(model):
     """Replace the elasticdl.layers.Embedding attributes in the model
     with `tf.keras.layers.Embedding` or `SparseEmbedding` layers.
     """
     for name, value in model.__dict__.items():
         if type(value) == Embedding:
             # The combiner is not None only for SparseEmbedding.
             if value.combiner is not None:
                 logger.info("Replace elasticdl with SparseEmbedding")
                 embedding_layer = SparseEmbedding(
                     output_dim=value.output_dim,
                     input_dim=value.input_dim,
                     embeddings_initializer=value.embeddings_initializer,
                     combiner=value.combiner,
                 )
             else:
                 logger.info("Replace elasticdl with ",
                             "tf.kerasl.layers.Embedding")
                 embedding_layer = tf.keras.layers.Embedding(
                     output_dim=value.output_dim,
                     input_dim=value.input_dim,
                     embeddings_initializer=value.embeddings_initializer,
                     mask_zero=value.mask_zero,
                     input_length=value.input_length,
                 )
             setattr(model, name, embedding_layer)
     return model
Example #8
    def _event_cb(self, event):
        evt_obj = event.get("object")
        evt_type = event.get("type")
        if not evt_obj or not evt_type:
            logger.error("Event doesn't have object or type: %s" % event)
            return

        pod_name = evt_obj.metadata.name
        phase = evt_obj.status.phase
        logger.info("Got event %s, phase %s for pod: %s" %
                    (evt_type, phase, pod_name))

        relaunch = False
        with self._lock:
            worker_id = self._pod_name_to_id.get(pod_name)
            if (worker_id is None
                    and pod_name != self._k8s_client.get_master_pod_name()):
                logger.error("Unknown worker pod name: %s" % pod_name)
                return

            self._pods_phase[worker_id] = (pod_name, phase)
            if evt_type == "DELETED":
                del self._pods_phase[worker_id]
                del self._pod_name_to_id[pod_name]
                self._task_d.recover_tasks(worker_id)

                # If a deleted pod was not "Succeeded", relaunch a worker.
                relaunch = (self._relaunch_deleted_live_worker
                            and phase != "Succeeded")
        if relaunch:
            logger.info("Relaunching worker.")
            self._start_worker(self._next_worker_id())
Example #9
 def create_tasks(self, task_type, model_version=-1):
     logger.info(
         "Creating a new set of %s tasks for model version %d",
         elasticdl_pb2._TASKTYPE.values_by_number[task_type].name.lower(),
         model_version,
     )
     if task_type == elasticdl_pb2.TRAINING:
         shards = self._training_shards
     elif task_type == elasticdl_pb2.EVALUATION:
         shards = self._evaluation_shards
     else:
         shards = self._prediction_shards
     tasks = []
     # Note that a shard may contain records for multiple tasks.
     for (
         shard_name,
         (start_ind_this_shard, num_records_this_shard),
     ) in shards.items():
         max_ind_this_shard = start_ind_this_shard + num_records_this_shard
         for start_ind_this_task in range(
             start_ind_this_shard,
             start_ind_this_shard + num_records_this_shard,
             self._records_per_task,
         ):
             max_ind_this_task = (
                 start_ind_this_task + self._records_per_task
             )
             end_ind_this_task = min(
                 max_ind_this_task, num_records_this_shard
             )
             # If the start index is not smaller than the end index,
             # find the correct end index by taking the start index
             # into account. We should not create a task whose end
             # index exceeds the maximum number of records available
             # in this shard.
             if start_ind_this_task >= end_ind_this_task:
                 end_ind_this_task = min(
                     max_ind_this_task,
                     start_ind_this_task + num_records_this_shard,
                     max_ind_this_shard,
                 )
             # Note that only records in [start, end) of this task
             # will be consumed later in the worker that handles
             # this task.
             tasks.append(
                 _Task(
                     shard_name=shard_name,
                     start=start_ind_this_task,
                     end=end_ind_this_task,
                     type=task_type,
                     model_version=model_version,
                 )
             )
     if task_type == elasticdl_pb2.TRAINING:
         random.shuffle(tasks)
         self._todo.extend(tasks)
     else:
         with self._lock:
             self._todo.extend(tasks)
     return tasks
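The nested loop above partitions each shard into half-open [start, end) ranges of at most `records_per_task` records. A self-contained sketch of that arithmetic with the straightforward clamp (`split_shard` is an illustrative helper, not part of the project):

    def split_shard(start_ind, num_records, records_per_task):
        # Cut [start_ind, start_ind + num_records) into chunks of at most
        # records_per_task records each.
        end_of_shard = start_ind + num_records
        return [
            (start, min(start + records_per_task, end_of_shard))
            for start in range(start_ind, end_of_shard, records_per_task)
        ]

    # A shard of 250 records starting at index 100, cut into tasks of 100:
    assert split_shard(100, 250, 100) == [(100, 200), (200, 300), (300, 350)]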
Example #10
    def prepare(self):
        self.validate()
        # Composite the components
        if self.task_manager and self.pod_manager:
            self.task_manager.set_task_timeout_callback(
                self.pod_manager._remove_worker)
        if self.pod_manager:
            self._set_command_in_pod_manager()
            # Add PodEventCallbacks for the listeners of Pod events.
            if self.task_manager:
                self.pod_manager.add_pod_event_callback(
                    TaskRescheduleCallback(self.task_manager))
            if self.rendezvous_server:
                self.pod_manager.add_pod_event_callback(
                    RendezvousServiceRefreshCallback(self.rendezvous_server))
            if self._is_tfv1_ps_strategy_custom_training():
                self.pod_manager.add_pod_event_callback(
                    TFV1PSStrategyTrainLoopMonitorCallback(self))

        # Start the components one by one
        if self.task_manager:
            self.task_manager.start()
        if self.rendezvous_server:
            self.rendezvous_server.start()
        if self.pod_manager:
            self.pod_manager.start()
        if self.elasticdl_job_service:
            self.elasticdl_job_service.start()

        # Start the master GRPC server
        logger.info("Starting master RPC server")
        self._master_server.start()
        logger.info("Master RPC server started")
Example #11
 def _start_ps(self, ps_id):
     logger.info("Starting PS: %d" % ps_id)
     bash_command = self._ps_args[1]
     bash_command += " --ps_id {}".format(ps_id)
     if self._log_file_path:
         bash_command += BashCommandTemplate.REDIRECTION.format(
             self._log_file_path)
     ps_args = [self._ps_args[0], bash_command]
     with self._lock:
         pod = self._k8s_client.create_ps(
             ps_id=ps_id,
             resource_requests=self._ps_resource_request,
             resource_limits=self._ps_resource_limit,
             pod_priority=self._ps_pod_priority,
             volume=self._volume,
             image_pull_policy=self._image_pull_policy,
             command=self._ps_command,
             args=ps_args,
             restart_policy=self._restart_policy,
             envs=copy.deepcopy(self._envs),
             expose_ports=False,
         )
         name = pod.metadata.name
         self._ps_pod_name_to_id[name] = ps_id
         self._ps_pods_phase[ps_id] = (name, None)
         self._k8s_client.create_ps_service(ps_id)
Example #12
 def _gen(self):
     """
     A generator that supports the iter() protocol (e.g. a generator
     function), used to create a `tf.data.Dataset` from a list of tasks.
     """
     while True:
         task = self._worker.get_task()
         if not task.shard_name:
             if task.type == elasticdl_pb2.WAIT:
                 self._pending_dataset = True
                 logger.info(
                     "Finish current dataset, maybe more data later")
             else:
                 logger.info("No more task, stopping")
             break
         with self._lock:
             if (self._training_with_evaluation
                     and task.type == elasticdl_pb2.EVALUATION):
                 self._pending_eval_tasks.append(task)
                 continue
             self._record_count += task.end - task.start
             self._pending_tasks_with_counts.append(
                 (task, self._record_count))
             if len(self._pending_tasks_with_counts) == 1:
                 self._current_task = task
         for data in self._data_reader.read_records(task):
             if data:
                 yield data
Example #13
    def report(self, task_id, success):
        """Report if the task is successful or not"""

        evaluation_task_completed = False
        with self._lock:
            _, task = self._doing.pop(task_id, (-1, None))
            if not task:
                logger.warning("Unknown task_id: %d" % task_id)
            elif not success:
                # TODO: keep count of retries.
                if task.type == elasticdl_pb2.TRAINING:
                    self._todo.append(task)
                else:
                    self._eval_todo.append(task)
            elif (task.type == elasticdl_pb2.EVALUATION
                  and self._evaluation_service is not None):
                evaluation_task_completed = True
            else:
                logger.info(
                    "Task:%d completed, %d remaining tasks",
                    task_id,
                    len(self._todo) + len(self._doing),
                )
        if evaluation_task_completed:
            self._evaluation_service.complete_task()
Example #14
 def _clone_function(layer):
     if type(layer) == Embedding:
         logger.info("Replace embedding layer with "
                     "elasticdl.layers.Embedding")
         # The combiner is not None only for SparseEmbedding.
         if layer.combiner is not None:
             embedding_layer = SparseEmbedding(
                 output_dim=layer.output_dim,
                 input_dim=layer.input_dim,
                 embeddings_initializer=layer.embeddings_initializer,
                 name=layer.name,
                 combiner=layer.combiner,
             )
         else:
             embedding_layer = tf.keras.layers.Embedding(
                 output_dim=layer.output_dim,
                 input_dim=layer.input_dim,
                 embeddings_initializer=layer.embeddings_initializer,
                 mask_zero=layer.mask_zero,
                 input_length=layer.input_length,
                 name=layer.name,
             )
         return embedding_layer
     elif type(layer) == tf.keras.layers.DenseFeatures:
         return _replace_edl_embedding_column_with_tf(layer)
     return layer
Example #15
    def run(self):
        """Execute the training loop"""
        epoch = 0
        step = 0

        train_tasks = self._gen_tasks(self.training_data)
        validation_tasks = self._gen_tasks(self.validation_data)
        train_dataset = self._get_dataset(train_tasks)
        validation_dataset = self._get_dataset(validation_tasks)

        while epoch < self.epoch:
            for features, labels in train_dataset:
                loss = self._train(features, labels)
                logger.info("step {}, Loss = {}".format(step, loss))
                step += 1
                if (
                    self.evaluation_steps > 0
                    and step % self.evaluation_steps == 0
                ):
                    self._evaluate(validation_dataset)
            self._evaluate(validation_dataset)
            logger.info("Epoch {} end".format(epoch))
            epoch += 1
        if self.save_model_dir != "":
            tf.saved_model.save(self.model_inst, self.save_model_dir)
Example #16
    def report(self, request, success):
        """Report if the task is successful or not"""

        task_id = request.task_id
        evaluation_task_completed = False
        with self._lock:
            _, task = self._doing.pop(task_id, (-1, None))
            if task:
                self._job_counters[
                    task.type].failed_records += request.exec_counters.get(
                        TaskExecCounterKey.FAIL_COUNT, 0)
            if not task:
                logger.warning("Unknown task_id: %d" % task_id)
            elif not success:
                # TODO: keep count of retries.
                if task.type == elasticdl_pb2.TRAINING:
                    self._todo.append(task)
                else:
                    self._eval_todo.append(task)
            elif (task.type == elasticdl_pb2.EVALUATION
                  and self._evaluation_service is not None):
                evaluation_task_completed = True
            else:
                logger.info(
                    "Task:%d completed, %d remaining tasks",
                    task_id,
                    len(self._todo) + len(self._doing),
                )
        if evaluation_task_completed:
            self._evaluation_service.complete_task()
Example #17
 def process(self, predictions, worker_id):
     if self.odps_writer:
         self.odps_writer.from_iterator(
             iter(predictions.numpy().tolist()), worker_id
         )
     else:
         logger.info(predictions.numpy())
Example #18
 def _process_save_model_task_if_needed(self):
     (
         task,
         dataset,
     ) = self._task_data_service.get_save_model_task_and_dataset()
     if task is not None and dataset is not None:
         dataset = self._dataset_fn(
             dataset,
             Mode.PREDICTION,
             self._task_data_service.data_reader.metadata,
         )
         dataset = dataset.batch(self._minibatch_size)
         saved_model_path = task.extended_config.get(
             SaveModelConfig.SAVED_MODEL_PATH
         )
         saved_model_path = os.path.join(
             saved_model_path, str(int(time.time()))
         )
         logger.info(
             "The path to export model is {}".format(saved_model_path)
         )
         model = self._model_handler.get_model_to_export(
             self._model, dataset
         )
         tf.saved_model.save(model, saved_model_path)
         self.report_task_result(task_id=task.task_id, err_msg="")
Example #19
    def create_master(self, **kargs):
        env = [
            V1EnvVar(
                name="MY_POD_IP",
                value_from=V1EnvVarSource(field_ref=V1ObjectFieldSelector(
                    field_path="status.podIP")),
            )
        ]
        if "envs" in kargs:
            for key in kargs["envs"]:
                env.append(V1EnvVar(name=key, value=kargs["envs"][key]))

        pod = self._create_pod(
            pod_name=self.get_master_pod_name(),
            job_name=self.job_name,
            image_name=self._image_name,
            command=["python"],
            resource_requests=kargs["resource_requests"],
            resource_limits=kargs["resource_limits"],
            container_args=kargs["args"],
            pod_priority=kargs["pod_priority"],
            image_pull_policy=kargs["image_pull_policy"],
            restart_policy=kargs["restart_policy"],
            volume=kargs["volume"],
            owner_pod=None,
            env=env,
        )
        # Add replica type and index
        pod.metadata.labels[ELASTICDL_REPLICA_TYPE_KEY] = "master"
        pod.metadata.labels[ELASTICDL_REPLICA_INDEX_KEY] = "0"
        self.client.create_namespaced_pod(self.namespace, pod)
        logger.info("Master launched.")
Example #20
    def _gen(self):
        """
        A generator that supports the iter() protocol (e.g. a generator
        function), used to create a `tf.data.Dataset` object from a list
        of tasks.
        """
        while True:
            # Make sure we also generate data from the warm-up task.
            if self._warm_up_task is not None and self._has_warmed_up:
                task = self._warm_up_task
                self._warm_up_task = None
            else:
                task = self._worker.get_task()
            if not task.shard_name:
                if task.type == elasticdl_pb2.WAIT:
                    self._pending_dataset = True
                    logger.info(
                        "Finish current dataset, maybe more data later"
                    )
                else:
                    logger.info("No more task, stopping")
                break
            with self._lock:
                if task.type == elasticdl_pb2.SAVE_MODEL:
                    self._pending_save_model_task = task
                    continue

                self._pending_tasks.append(task)
                if len(self._pending_tasks) == 1:
                    self._current_task = task
            for data in self.data_reader.read_records(task):
                if data:
                    yield data
Example #21
 def delete_master(self):
     logger.info("pod name is %s" % self.get_master_pod_name())
     self.client.delete_namespaced_pod(
         self.get_master_pod_name(),
         self.namespace,
         body=client.V1DeleteOptions(grace_period_seconds=0),
     )
Example #22
 def _start_worker(self, worker_id):
     logger.info("Starting worker: %d" % worker_id)
     bash_command = self._worker_args[1]
     bash_command += " --worker_id {}".format(worker_id)
     bash_command += " --ps_addrs {}".format(self._ps_addrs)
     if self._log_file_path:
         bash_command += BashCommandTemplate.REDIRECTION.format(
             self._log_file_path)
     worker_args = [self._worker_args[0], bash_command]
     with self._lock:
         pod = self._k8s_client.create_worker(
             worker_id=worker_id,
             resource_requests=self._worker_resource_request,
             resource_limits=self._worker_resource_limit,
             pod_priority=self._worker_pod_priority[worker_id],
             termination_period=1,
             volume=self._volume,
             image_pull_policy=self._image_pull_policy,
             command=self._worker_command,
             args=worker_args,
             restart_policy=self._restart_policy,
             ps_addrs=self._ps_addrs,
             envs=copy.deepcopy(self._envs),
             expose_ports=self._expose_ports,
         )
         name = pod.metadata.name
         self._worker_pod_name_to_id[name] = worker_id
         self._worker_pods_phase[worker_id] = (name, None)
         self._k8s_client.create_worker_service(worker_id)
Example #23
 def _replace_attr_with_edl_embedding(model):
     """Replace the keras embedding attributes in the model with
     `elasticdl.layers.Embedding` layers.
     """
     for name, value in model.__dict__.items():
         if type(value) == tf.keras.layers.Embedding:
             logger.info("Replace {} layer with "
                         "elasticdl.layers.Embedding".format(value))
             initializer_name = tf.keras.initializers.serialize(
                 value.embeddings_initializer)["class_name"]
             embedding_layer = Embedding(
                 output_dim=value.output_dim,
                 input_dim=value.input_dim,
                 embeddings_initializer=initializer_name,
                 mask_zero=value.mask_zero,
                 input_length=value.input_length,
             )
             setattr(model, name, embedding_layer)
         elif type(value) == SparseEmbedding:
             logger.info("Replace {} layer with "
                         "elasticdl.layers.Embedding".format(value))
             # Serialize the initializer here too; it was previously only
             # set in the tf.keras.layers.Embedding branch above.
             initializer_name = tf.keras.initializers.serialize(
                 value.embeddings_initializer)["class_name"]
             embedding_layer = Embedding(
                 output_dim=value.output_dim,
                 input_dim=value.input_dim,
                 embeddings_initializer=initializer_name,
                 combiner=value.combiner,
             )
             setattr(model, name, embedding_layer)
     return model
Example #24
 def start_redis_service(self):
     args = self._parse_embedding_service_args()
     logger.info("Starting redis server on ports: %d - %d, "
                 "--cluster_node_timeout %d" % (
                     args.first_port,
                     args.first_port + args.num_of_redis_instances - 1,
                     args.cluster_node_timeout,
                 ))
     failed_port = []
     for i in range(args.num_of_redis_instances):
         port = args.first_port + i
         command = (
             "redis-server --port %d --cluster-enabled yes "
             "--cluster-config-file nodes-%d.conf --cluster-node-timeout"
             " %d --appendonly yes --appendfilename appendonly-%d.aof "
             "--dbfilename dump-%d.rdb --logfile %d.log --daemonize yes "
             "--protected-mode no" %
             (port, port, args.cluster_node_timeout, port, port, port))
         return_code = self._run_shell_command(command)
         if return_code:
             failed_port.append(port)
     if failed_port:
         local_ip = os.getenv("MY_POD_IP", "localhost")
         logger.info("%s starts these redis instances failed: %s" %
                     (local_ip, ";".join(map(str, failed_port))))
Example #25
def create_master_service(
    port,
    task_manager,
    pod_manager,
    rendezvous_server,
    evaluation_service,
):
    """Create GRPC server
    """
    logger.info("Creating master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        task_manager=task_manager,
        instance_manager=pod_manager,
        rendezvous_server=rendezvous_server,
        evaluation_service=evaluation_service,
    )
    elasticai_api_pb2_grpc.add_MasterServicer_to_server(
        master_servicer, server)
    elasticdl_pb2_grpc.add_TrainLoopMasterServicer_to_server(
        master_servicer, server)
    server.add_insecure_port("[::]:{}".format(port))
    logger.info("The port of the master server is: %d", port)

    return server
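A hedged usage sketch: the function returns the `grpc.Server` without starting it, so the caller drives its lifecycle (the port value and the pre-built component objects are assumptions):

    server = create_master_service(
        port=50001,
        task_manager=task_manager,
        pod_manager=pod_manager,
        rendezvous_server=rendezvous_server,
        evaluation_service=evaluation_service,
    )
    server.start()                 # begin serving RPCs
    server.wait_for_termination()  # block the main thread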
Example #26
def print_args(args, groups=None):
    """
    Args:
        args: parsing results returned from `parser.parse_args`
        groups: a list of lists that controls which options are printed
        together. For example, model specifications such as `optimizer`
        and `loss` are better printed together.
        groups = [["optimizer", "loss"]]
    """
    def _get_attr(instance, attribute):
        try:
            return getattr(instance, attribute)
        except AttributeError:
            return None

    dedup = set()
    if groups:
        for group in groups:
            for element in group:
                dedup.add(element)
                logger.info("%s = %s", element, _get_attr(args, element))
    other_options = [(key, value) for (key, value) in args.__dict__.items()
                     if key not in dedup]
    for key, value in other_options:
        logger.info("%s = %s", key, value)
Example #27
 def request_stop(self, success, msg=""):
     self._stop_requested = True
     if success:
         self._exit_code = 0
         logger.info(msg)
     else:
         self._exit_code = 1
         logger.error(msg)
Example #28
    def _remove_ps(self, ps_id):
        logger.info("Removing PS: %d", ps_id)
        with self._lock:
            if ps_id not in self._ps_pods_phase:
                logger.error("Unknown PS id: %s" % ps_id)
                return

        self._k8s_client.delete_ps(ps_id)
Example #29
 def start_tensorboard_service(self):
     self._k8s_client.create_tensorboard_service()
     logger.info("Waiting for the URL for TensorBoard service...")
     tb_url = self._get_tensorboard_url()
     if tb_url:
         logger.info("TensorBoard service is available at: %s" % tb_url)
     else:
         logger.warning("Unable to get the URL for TensorBoard service")
Example #30
    def _remove_worker(self, worker_id):
        logger.info("Removing worker: %d", worker_id)
        with self._lock:
            if worker_id not in self._worker_pods_phase:
                logger.error("Unknown worker id: %s" % worker_id)
                return

        # TODO: change _k8s_client to accept pod name instead of worker id.
        self._k8s_client.delete_worker(worker_id)