Example #1
def print_args(args, groups=None):
    """
    Args:
        args: parsing results returned from `parser.parse_args`
        groups: A list of lists that controls which options are printed
        together. For example, to print model specifications such as
        `optimizer` and `loss` together, pass
        groups = [["optimizer", "loss"]].
    """
    def _get_attr(instance, attribute):
        try:
            return getattr(instance, attribute)
        except AttributeError:
            return None

    dedup = set()
    if groups:
        for group in groups:
            for element in group:
                dedup.add(element)
                logger.info("%s = %s", element, _get_attr(args, element))
    other_options = [(key, value) for (key, value) in args.__dict__.items()
                     if key not in dedup]
    for key, value in other_options:
        logger.info("%s = %s", key, value)
Example #2
    def _event_cb(self, event):
        evt_obj = event.get("object")
        evt_type = event.get("type")
        if not evt_obj or not evt_type:
            logger.error("Event doesn't have object or type: %s" % event)
            return

        pod_name = evt_obj.metadata.name
        phase = evt_obj.status.phase
        logger.info("Got event %s, phase %s for pod: %s" %
                    (evt_type, phase, pod_name))

        relaunch = False
        with self._lock:
            worker_id = self._pod_name_to_id.get(pod_name)
            if (worker_id is None
                    and pod_name != self._k8s_client.get_master_pod_name()):
                logger.error("Unknown worker pod name: %s" % pod_name)
                return

            self._pods_phase[worker_id] = (pod_name, phase)
            if evt_type == "DELETED":
                del self._pods_phase[worker_id]
                del self._pod_name_to_id[pod_name]
                self._task_d.recover_tasks(worker_id)

                # If a deleted pod was not "Succeeded", relaunch a worker.
                relaunch = (self._relaunch_deleted_live_worker
                            and phase != "Succeeded")
        if relaunch:
            logger.info("Relaunching worker.")
            self._start_worker(self._next_worker_id())
Example #3
    def create_master(self, **kargs):
        env = [
            V1EnvVar(
                name="MY_POD_IP",
                value_from=V1EnvVarSource(field_ref=V1ObjectFieldSelector(
                    field_path="status.podIP")),
            )
        ]
        if "envs" in kargs:
            for key in kargs["envs"]:
                env.append(V1EnvVar(name=key, value=kargs["envs"][key]))

        pod = self._create_pod(
            pod_name=self.get_master_pod_name(),
            job_name=self.job_name,
            image_name=self._image_name,
            command=["python"],
            resource_requests=kargs["resource_requests"],
            resource_limits=kargs["resource_limits"],
            container_args=kargs["args"],
            pod_priority=kargs["pod_priority"],
            image_pull_policy=kargs["image_pull_policy"],
            restart_policy=kargs["restart_policy"],
            volume=kargs["volume"],
            owner_pod=None,
            env=env,
        )
        # Add replica type and index
        pod.metadata.labels[ELASTICDL_REPLICA_TYPE_KEY] = "master"
        pod.metadata.labels[ELASTICDL_REPLICA_INDEX_KEY] = "0"
        self.client.create_namespaced_pod(self.namespace, pod)
        logger.info("Master launched.")
Example #4
 def delete_master(self):
     logger.info("pod name is %s" % self.get_master_pod_name())
     self.client.delete_namespaced_pod(
         self.get_master_pod_name(),
         self.namespace,
         body=client.V1DeleteOptions(grace_period_seconds=0),
     )
Example #5
 def process(self, predictions, worker_id):
     if self.odps_writer:
         self.odps_writer.from_iterator(
             iter(predictions.numpy().tolist()), worker_id
         )
     else:
         logger.info(predictions.numpy())
Example #6
 def _gen(self):
     """
     A generator that supports the iter() protocol (i.e. a generator
     function), used to create a dataset for RecordIO.
     """
     while True:
         task = self._worker.get_task()
         if not task.shard_file_name:
             if task.type == elasticdl_pb2.WAIT:
                 self._pending_dataset = True
                 logger.info(
                     "Finish current dataset, maybe more data later")
             else:
                 logger.info("No more task, stopping")
             break
         with self._lock:
             if (self._training_with_evaluation
                     and task.type == elasticdl_pb2.EVALUATION):
                 self._pending_eval_tasks.append(task)
                 continue
             self._record_count += task.end - task.start
             self._pending_tasks_with_counts.append(
                 (task, self._record_count))
             if len(self._pending_tasks_with_counts) == 1:
                 self._current_task = task
         with closing(
                 recordio.Scanner(task.shard_file_name, task.start,
                                  task.end - task.start)) as reader:
             while True:
                 record = reader.record()
                 if record:
                     yield record
                 else:
                     break
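
A self-contained sketch of the RecordIO read path used above, assuming the `pyrecordio` package that provides `recordio.Writer` and `recordio.Scanner`; the Scanner arguments mirror the task's start offset and record count:

from contextlib import closing
import recordio

# Write a small shard to scan.
with closing(recordio.Writer("/tmp/shard-0.rio")) as w:
    for i in range(5):
        w.write(("record-%d" % i).encode())

# Read 3 records starting at offset 1, like a task slice.
with closing(recordio.Scanner("/tmp/shard-0.rio", 1, 3)) as reader:
    while True:
        record = reader.record()
        if not record:
            break
        print(record)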
Example #7
    def stop_embedding_service(self, save="nosave"):
        failed_redis_nodes = []
        for redis_node in [
            "-h %s -p %d" % (ip, port)
            for ip, port_list in self._embedding_service_endpoint.items()
            for port in port_list
        ]:
            command = "redis-cli %s shutdown %s" % (redis_node, save)
            return_code = self._run_shell_command(command)
            if return_code:
                failed_redis_nodes.append(redis_node)

        if failed_redis_nodes:
            failed_redis_nodes = [i.split(" ") for i in failed_redis_nodes]
            logger.info(
                "Stop these redis nodes failed: %s."
                % ";".join(
                    [
                        "%s:%s" % (redis_node[1], redis_node[3])
                        for redis_node in failed_redis_nodes
                    ]
                )
            )
            return False

        return True
Example #8
 def _gen(self):
     """
     A generator that supports the iter() protocol (i.e. a generator
     function), used to create a `tf.data.Dataset` from a list of tasks.
     """
     while True:
         task = self._worker.get_task()
         if not task.shard_name:
             if task.type == elasticdl_pb2.WAIT:
                 self._pending_dataset = True
                 logger.info(
                     "Finish current dataset, maybe more data later")
             else:
                 logger.info("No more task, stopping")
             break
         with self._lock:
             if (self._training_with_evaluation
                     and task.type == elasticdl_pb2.EVALUATION):
                 self._pending_eval_tasks.append(task)
                 continue
             self._record_count += task.end - task.start
             self._pending_tasks_with_counts.append(
                 (task, self._record_count))
             if len(self._pending_tasks_with_counts) == 1:
                 self._current_task = task
         for data in self._data_reader.read_records(task):
             if data:
                 yield data
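
The docstring says this generator feeds a `tf.data.Dataset`. A hedged sketch of that wrapping (not ElasticDL's exact wiring), with a stand-in generator yielding serialized records:

import tensorflow as tf

def records():
    yield b"record-1"
    yield b"record-2"

# Wrap the generator; each element is a scalar string tensor.
dataset = tf.data.Dataset.from_generator(records, output_types=tf.string)
for record in dataset.take(2):
    print(record.numpy())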
Example #9
 def start_redis_service(self):
     args = self._parse_embedding_service_args()
     logger.info("Starting redis server on ports: %d - %d, "
                 "--cluster_node_timeout %d" % (
                     args.first_port,
                     args.first_port + args.num_of_redis_instances - 1,
                     args.cluster_node_timeout,
                 ))
     failed_port = []
     for i in range(args.num_of_redis_instances):
         port = args.first_port + i
         command = (
             "redis-server --port %d --cluster-enabled yes "
             "--cluster-config-file nodes-%d.conf --cluster-node-timeout"
             " %d --appendonly yes --appendfilename appendonly-%d.aof "
             "--dbfilename dump-%d.rdb --logfile %d.log --daemonize yes "
             "--protected-mode no" %
             (port, port, args.cluster_node_timeout, port, port, port))
         return_code = self._run_shell_command(command)
         if return_code:
             failed_port.append(port)
     if failed_port:
         local_ip = os.getenv("MY_POD_IP", "localhost")
         logger.info("%s starts these redis instances failed: %s" %
                     (local_ip, ";".join(map(str, failed_port))))
Example #10
 def create_tasks(self, task_type, model_version=-1):
     logger.info(
         "Creating a new set of %s tasks for model version %d",
         elasticdl_pb2._TASKTYPE.values_by_number[task_type].name.lower(),
         model_version,
     )
     if task_type == elasticdl_pb2.TRAINING:
         shards = self._training_shards
     elif task_type == elasticdl_pb2.EVALUATION:
         shards = self._evaluation_shards
     else:
         shards = self._prediction_shards
     tasks = []
     for name, num_records in shards.items():
         for start in range(0, num_records, self._records_per_task):
             tasks.append(
                 _Task(
                     shard_name=name,
                     start=start,
                     end=min(start + self._records_per_task, num_records),
                     type=task_type,
                     model_version=model_version,
                 ))
     if task_type == elasticdl_pb2.TRAINING:
         random.shuffle(tasks)
         self._todo.extend(tasks)
     else:
         with self._lock:
             self._todo.extend(tasks)
     return tasks
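
The start/end arithmetic above partitions each shard into fixed-size ranges, truncating the last task at the shard boundary. A standalone illustration with made-up numbers:

num_records, records_per_task = 10, 4
ranges = [
    (start, min(start + records_per_task, num_records))
    for start in range(0, num_records, records_per_task)
]
print(ranges)  # [(0, 4), (4, 8), (8, 10)]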
Example #11
 def create_training_tasks(self):
     logger.info("Creating a new set of training tasks with epoch=%d",
                 self._epoch)
     tasks = self._create_tasks(self._training_shards,
                                elasticdl_pb2.TRAINING)
     random.shuffle(tasks)
     self._todo.extend(tasks)
     return tasks
Example #12
 def start_tensorboard_service(self):
     self._create_tensorboard_service()
     logger.info("Waiting for the URL for TensorBoard service...")
     tb_url = self._get_tensorboard_url()
     if tb_url:
         logger.info("TensorBoard service is available at: %s" % tb_url)
     else:
         logger.warning("Unable to get the URL for TensorBoard service")
Example #13
def _print_docker_progress(line):
    error = line.get("error", None)
    if error:
        raise RuntimeError("Docker image build: " + error)
    stream = line.get("stream", None)
    if stream:
        logger.info(stream)
    else:
        logger.info(line)
Example #14
    def _remove_worker(self, worker_id):
        logger.info("Removing worker: %d", worker_id)
        with self._lock:
            if worker_id not in self._pods_phase:
                logger.error("Unknown worker id: %s" % worker_id)
                return

        # TODO: change _k8s_client to accept pod name instead of worker id.
        self._k8s_client.delete_worker(worker_id)
Example #15
def _build_docker_image(client, ctx_dir, dockerfile, image_name):
    logger.info("===== Building Docker Image =====")
    for line in client.build(
            dockerfile=dockerfile,
            path=ctx_dir,
            rm=True,
            tag=image_name,
            decode=True,
    ):
        _print_docker_progress(line)
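
A hedged usage sketch with docker-py's low-level `APIClient`, which provides the streaming `build()` generator consumed above; the context directory, Dockerfile path, and tag here are hypothetical:

import docker

client = docker.APIClient()
_build_docker_image(
    client,
    ctx_dir=".",
    dockerfile="./Dockerfile",
    image_name="elasticdl:dev",
)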
Example #16
def copy_if_not_exists(src, dst, is_dir):
    if os.path.exists(dst):
        logger.info(
            "Skip copying from %s to %s since the destination already exists"
            % (src, dst)
        )
    else:
        if is_dir:
            shutil.copytree(src, dst)
        else:
            shutil.copy(src, dst)
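
Quick usage sketch with throwaway paths (assumes a configured module-level `logger`, as elsewhere); the second call logs the skip message because the destination already exists:

import tempfile

src = tempfile.mkdtemp()
dst = src + "-copy"
copy_if_not_exists(src, dst, is_dir=True)
copy_if_not_exists(src, dst, is_dir=True)  # logs "Skip copying ..."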
Example #17
 def _init_model(self, checkpoint_filename_for_init, init_var):
     if checkpoint_filename_for_init:
         pb_model = load_from_checkpoint_file(checkpoint_filename_for_init)
         self._version = pb_model.version
         self._init_model_from_tensor_dict(pb_model.param)
     elif init_var:
         self._init_model_from_var_list(init_var)
     else:
         logger.info("Model is not intialized. It will be "
                     "initialized by the first update from "
                     "the worker.")
Example #18
 def create_evaluation_tasks(self, eval_model_version):
     logger.info(
         "Creating a new set of evaluation tasks for model version %d",
         eval_model_version,
     )
     tasks = self._create_tasks(
         self._evaluation_shards,
         elasticdl_pb2.EVALUATION,
         eval_model_version,
     )
     with self._lock:
         self._todo.extend(tasks)
     return tasks
Example #19
 def create_prediction_tasks(self, predict_model_version):
     logger.info(
         "Creating a new set of prediction tasks for model version %d",
         predict_model_version,
     )
     tasks = self._create_tasks(
         self._prediction_shards,
         elasticdl_pb2.PREDICTION,
         predict_model_version,
     )
     with self._lock:
         self._todo.extend(tasks)
     return tasks
Example #20
 def _process_minibatch(
     self,
     task_type,
     features,
     labels,
     min_model_version,
     train_with_local_model=False,
 ):
     if self._need_embedding_layer_check or not self._var_created:
         self._run_model_call_before_training(features)
     for _ in range(self._max_minibatch_retry_num):
         if task_type == elasticdl_pb2.EVALUATION:
             if min_model_version == -1:
                 if self._model_version < 0:
                     self.get_model(0, elasticdl_pb2.MINIMUM)
             elif self._model_version != min_model_version:
                 self.get_model(min_model_version, elasticdl_pb2.FIXED)
             accepted = self._run_evaluation_task(features, labels)
             if accepted:
                 break
         elif task_type == elasticdl_pb2.TRAINING:
             # TODO: optimize the logic to avoid unnecessary
             #       get_model call.
             if not train_with_local_model:
                 self.get_model(
                     max(self._model_version, min_model_version),
                     elasticdl_pb2.MINIMUM,
                 )
             accepted, min_model_version, loss = self._run_training_task(
                 features, labels)
             if accepted:
                 logger.info("Loss is %f" % loss.numpy())
                 break
         elif task_type == elasticdl_pb2.PREDICTION:
             if self._model_version != min_model_version:
                 self.get_model(min_model_version, elasticdl_pb2.FIXED)
             accepted = self._run_prediction_task(features)
             if accepted:
                 break
         else:
             raise RuntimeError("Unrecognized task type, %s" % task_type)
     else:
         # Worker got stuck, fail the task.
         # TODO: stop the worker if it fails to make any
         #       progress for some time.
         raise RuntimeError("Worker got stuck")
     return min_model_version
Example #21
 def _save_checkpoint(self, locking, is_eval_checkpoint):
     try:
         logger.info("Saving checkpoint for model version %d" %
                     self._version)
         if locking:
             self._lock.acquire()
         pb_model = self._get_model_no_lock()
         self._checkpoint_service.save(self._version, pb_model,
                                       is_eval_checkpoint)
         checkpoint_version = self._version
         if locking:
             self._lock.release()
         return checkpoint_version
     except Exception:
         logger.error(
             "Failed to save checkpoint file for model version %d" %
             self._version)
Example #22
 def complete_task(self):
     self._eval_job.complete_task()
     if self._eval_job.finished():
         evaluation_metrics = self._eval_job.get_evaluation_summary()
         if self._tensorboard_service and evaluation_metrics:
             self._tensorboard_service.write_dict_to_summary(
                 evaluation_metrics, version=self._eval_job.model_version)
         logger.info("Evaluation metrics[v=%d]: %s" % (
             self._eval_job.model_version if self._eval_job.model_version >=
             0 else self._master_servicer.get_model_version(),
             str(evaluation_metrics),
         ))
         if not self._eval_only:
             # delete checkpoint file
             self._checkpoint_service.remove_eval_checkpoint(
                 self._eval_job.model_version)
             self._eval_job = None
             # create new eval job if possible
             self.try_to_create_new_job()
Example #23
 def _start_worker(self, worker_id):
     logger.info("Starting worker: %d" % worker_id)
     with self._lock:
         pod = self._k8s_client.create_worker(
             worker_id=worker_id,
             resource_requests=self._resource_requests,
             resource_limits=self._resource_limits,
             pod_priority=self._pod_priority,
             volume=self._volume,
             image_pull_policy=self._image_pull_policy,
             command=self._command,
             args=self._args +
             ["--worker_id", str(worker_id)],
             restart_policy=self._restart_policy,
             envs=self._envs,
         )
         name = pod.metadata.name
         self._pod_name_to_id[name] = worker_id
         self._pods_phase[worker_id] = (name, None)
Example #24
    def __init__(
        self,
        training_shards,
        evaluation_shards,
        prediction_shards,
        records_per_task,
        num_epochs,
    ):
        """
        Arguments:
            training_shards: A dictionary from RecordIO file name to the
                number of training records.
            evaluation_shards: A dictionary from RecordIO file name to
                the number of evaluation records.
            prediction_shards: A dictionary from RecordIO file name to
                the number of prediction records.
            records_per_task: The number of records per task.
            num_epochs: The total number of epochs for the tasks where
                an epoch is a complete iteration over the shards.
        """
        self._lock = threading.Lock()

        self._num_epochs = num_epochs
        self._epoch = 0
        self._training_shards = training_shards
        self._evaluation_shards = evaluation_shards
        self._prediction_shards = prediction_shards
        self._records_per_task = records_per_task

        self._todo = []
        # dictionary from task id to Task.
        self._doing = {}
        self._task_id = 0
        self._evaluation_service = None

        if self._training_shards:
            logger.info("Starting epoch %d", self._epoch)
            self.create_tasks(elasticdl_pb2.TRAINING)
        elif self._evaluation_shards:
            self.create_tasks(elasticdl_pb2.EVALUATION)
        elif self._prediction_shards:
            self.create_tasks(elasticdl_pb2.PREDICTION)
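
A construction sketch, assuming the __init__ above belongs to a task-dispatcher class (called `_TaskDispatcher` here; the real class name is not shown) and using made-up shard files and record counts:

dispatcher = _TaskDispatcher(
    training_shards={"train-0.rio": 1000, "train-1.rio": 800},
    evaluation_shards={"eval-0.rio": 200},
    prediction_shards={},
    records_per_task=100,
    num_epochs=2,
)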
Example #25
def _submit_job(image_name, client_args, container_args):
    client = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        event_callback=None,
        cluster_spec=client_args.cluster_spec,
    )

    client.create_master(
        resource_requests=client_args.master_resource_request,
        resource_limits=client_args.master_resource_limit,
        args=container_args,
        pod_priority=client_args.master_pod_priority,
        image_pull_policy=client_args.image_pull_policy,
        restart_policy=client_args.restart_policy,
        volume=client_args.volume,
        envs=parse_envs(client_args.envs),
    )
    logger.info(
        "ElasticDL job %s was successfully submitted. The master pod is: %s." %
        (client_args.job_name, client.get_master_pod_name()))
Example #26
    def get(self, worker_id):
        """Return next (task_id, Task) tuple"""

        with self._lock:
            # TODO: check if task queue doesn't have training task,
            #       to avoid the queue is overwhelmed by evaluation tasks.
            if not self._todo and self._epoch < self._num_epochs - 1:
                # Start a new epoch
                self._epoch += 1
                self.create_tasks(elasticdl_pb2.TRAINING)
                logger.info("Starting epoch %d", self._epoch)

            if not self._todo:
                # No more tasks
                return -1, None

            self._task_id += 1
            task = self._todo.pop()
            # TODO: Handle timeout of tasks.
            self._doing[self._task_id] = (worker_id, task)

            return self._task_id, task
Example #27
    def report(self, task_id, success):
        """Report if the task is successful or not"""

        evaluation_task_completed = False
        with self._lock:
            _, task = self._doing.pop(task_id, (-1, None))
            if not task:
                logger.warning("Unknown task_id: %d" % task_id)
            elif not success:
                # TODO: keep count of retries.
                self._todo.append(task)
            elif (task.type == elasticdl_pb2.EVALUATION
                  and self._evaluation_service is not None):
                evaluation_task_completed = True
            else:
                logger.info(
                    "Task:%d completed, %d remaining tasks",
                    task_id,
                    len(self._todo) + len(self._doing),
                )
        if evaluation_task_completed:
            self._evaluation_service.complete_task()
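
A hypothetical driver loop pairing `get` and `report` (the dispatcher instance and the `process` callable are assumptions; `get` returns `(-1, None)` when no tasks remain):

def run_worker(dispatcher, worker_id, process):
    while True:
        task_id, task = dispatcher.get(worker_id)
        if task is None:
            break
        try:
            process(task)
            dispatcher.report(task_id, success=True)
        except Exception:
            # A failed task is pushed back onto the todo list.
            dispatcher.report(task_id, success=False)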
Example #28
    def start_embedding_pod_and_redis(
        self,
        command,
        args,
        embedding_service_id=0,
        resource_request="cpu=1,memory=4096Mi",
        resource_limit="cpu=1,memory=4096Mi",
        pod_priority=None,
        volume=None,
        image_pull_policy=None,
        restart_policy="Never",
        **kargs,
    ):
        logger.info("Starting pod for embedding service ...")
        self._k8s_client = k8s.Client(event_callback=None, **kargs)
        pod = self._k8s_client.create_embedding_service(
            worker_id=embedding_service_id,
            resource_requests=resource_request,
            resource_limits=resource_limit,
            pod_priority=pod_priority,
            volume=volume,
            image_pull_policy=image_pull_policy,
            command=command,
            args=args,
            restart_policy=restart_policy,
        )

        # TODO: assign address with pod's domain name instead of pod's ip.
        # and should not fix ports
        address_ip = pod.status.pod_ip
        while not address_ip:
            pod = self._k8s_client.get_embedding_service_pod(
                embedding_service_id)
            address_ip = pod.status.pod_ip
        self._embedding_service_endpoint = {
            address_ip: [30001 + i for i in range(6)]
        }
Example #29
def write_to_recordio(filename, data_list):
    logger.info("Writing to file:", filename)
    with closing(recordio.Writer(filename)) as f:
        for d in data_list:
            f.write(d)
Example #30
def _push_docker_image(client, image_name):
    logger.info("===== Pushing Docker Image =====")
    for line in client.push(image_name, stream=True, decode=True):
        _print_docker_progress(line)