Code example #1
 def get_master_pod(self):
     try:
         return self.client.read_namespaced_pod(
             name=self.get_master_pod_name(), namespace=self.namespace)
     except client.api_client.ApiException as e:
         logger.warning("Exception when reading master pod: %s\n" % e)
         return None
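
A hedged usage sketch: `k8s_client` below stands for an already constructed
instance of the surrounding Kubernetes client class (it is not defined in the
excerpt above), and callers must handle the `None` return value.

pod = k8s_client.get_master_pod()
if pod is None:
    # The read failed, e.g. the master pod has not been created yet.
    logger.warning("Master pod not found, will retry later")
else:
    # `pod` is a kubernetes V1Pod object returned by read_namespaced_pod.
    print(pod.status.phase)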
Code example #2
 def ReportTaskResult(self, request, _):
     if request.err_message:
         logger.warning("Worker reported error: " + request.err_message)
         self._task_d.report(request.task_id, False)
     else:
         self._task_d.report(request.task_id, True)
     return empty_pb2.Empty()
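
A hedged sketch of the worker-side call that reaches this handler. The stub
and request message names (`MasterStub`, `ReportTaskResultRequest`) are
assumptions about the generated gRPC code and may differ from the actual
proto definitions:

# `channel` is an existing grpc.Channel to the master address and `task_id`
# is the id of the task the worker just finished.
stub = elasticdl_pb2_grpc.MasterStub(channel)
request = elasticdl_pb2.ReportTaskResultRequest()
request.task_id = task_id
request.err_message = ""  # an empty message means the task succeeded
stub.ReportTaskResult(request)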
Code example #3
    def _generate_lookup_keys(self, grads_and_vars):
        """Generate lookup keys from a list of (gradient, layer name) pairs.

        Arguments:
            grads_and_vars: A list of (gradient, layer name) pairs.

        Returns:
            A tuple of (`embedding_keys`, `slot_keys`, `embedding_key_index`,
                `slot_key_index`).
            `embedding_keys`: A list of keys for embedding vectors in kv
                store.
            `slot_keys`: A list of keys for slots in kv store.
            `embedding_key_index`: A python dictionary recording the positions
                of embedding keys for the same layer, i.e. an item
                `{layer_name: (start, end)}` means `embedding_keys[start:end]`
                are keys for the same layer named `layer_name`.
            `slot_key_index`: A python dictionary recording the positions of
                slot keys for the same layer and the same slot, i.e. an item
                `{layer_name: {slot_name: (start, end)}}` means
                `slot_keys[start:end]` are keys for the same layer named
                `layer_name` and the same slot named `slot_name`.

        """
        embed_keys = []
        embed_key_index = {}
        slot_keys = []
        slot_key_index = {}
        self._unique_ids_all_layers = {}

        # generate keys
        for it, (grad, layer_name) in enumerate(grads_and_vars):
            # de-duplicate gradient's indices
            unique_ids, indices = tf.unique(grad.indices)
            unique_ids = unique_ids.numpy()
            if layer_name in self._unique_ids_all_layers:
                # TODO: support grads_and_vars with duplicated layer name
                logger.warning("grads_and_vars has duplicated layer name %s." %
                               layer_name)
            self._unique_ids_all_layers[layer_name] = unique_ids
            grad_new = tf.IndexedSlices(grad.values, indices)
            grads_and_vars[it] = (grad_new, layer_name)

            # generate embedding keys
            start = len(embed_keys)
            embed_keys.extend(
                [Embedding.get_key([layer_name, i]) for i in unique_ids])
            end = len(embed_keys)
            embed_key_index[layer_name] = (start, end)

            # generate slot keys
            for slot in self._allowed_slot_names:
                start = len(slot_keys)
                slot_keys.extend([
                    Embedding.get_key([layer_name, slot, i])
                    for i in unique_ids
                ])
                end = len(slot_keys)
                slot_key_index.setdefault(layer_name,
                                          {}).setdefault(slot, (start, end))
        return embed_keys, slot_keys, embed_key_index, slot_key_index
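
A minimal, self-contained illustration of the index bookkeeping above: the
keys generated for one layer occupy a contiguous [start, end) range of the
flat key list, which is exactly what `embedding_key_index` records. The key
format used here is illustrative only; the real keys come from
`Embedding.get_key`.

embed_keys, embed_key_index = [], {}
for layer_name, unique_ids in [("item_emb", [4, 9]), ("user_emb", [2])]:
    start = len(embed_keys)
    embed_keys.extend("%s-%d" % (layer_name, i) for i in unique_ids)
    embed_key_index[layer_name] = (start, len(embed_keys))

start, end = embed_key_index["item_emb"]
assert embed_keys[start:end] == ["item_emb-4", "item_emb-9"]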
Code example #4
    def _run_model_call_before_training(self, features):
        """Call `self._model.call` before training to do two things:
            * Create variables and report them to the ps if not created.
            * Check whether there is an embedding layer that is called
              more than once during one forward-pass.
        """
        if self._embedding_layers:
            with tf.GradientTape() as tape:
                self._set_tape_for_embedding(tape)
                _ = self._model.call(features)
        else:
            _ = self._model.call(features)
        self._non_embed_vars = get_non_embedding_trainable_vars(
            self._model, self._embedding_layers)

        if not self._var_created:
            self.report_variable()
            self._var_created = True

        if self._need_embedding_layer_check:
            self._train_eagerly = False
            for layer in self._embedding_layers:
                if len(layer.embedding_and_ids) > 1:
                    self._train_eagerly = True
                    logger.warning(
                        "ElasticDL embedding layer %s is called more than "
                        "once, this will make the training process unable "
                        "to accelerate with tf.function." % (layer.name))
            self._need_embedding_layer_check = False

        self._reset_embedding()
Code example #5
File: servicer.py Project: gavinljj/elasticdl
 def _validate_model_version(self, request_model_version):
     if request_model_version > self._version:
         err_msg = ("Model version %d not available yet, "
                    "current version: %d" %
                    (request_model_version, self._version))
         logger.warning(err_msg)
         raise ValueError(err_msg)
     return request_model_version == self._version
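
A self-contained sketch of the three possible outcomes; the standalone
function below mirrors the rule above with a hypothetical current version
of 10:

def validate(request_model_version, current_version=10):
    if request_model_version > current_version:
        raise ValueError("Model version %d not available yet, "
                         "current version: %d"
                         % (request_model_version, current_version))
    return request_model_version == current_version

assert validate(10) is True   # gradient computed on the latest model
assert validate(7) is False   # stale result; the caller decides to drop it
# validate(12) would raise ValueError: a worker cannot be ahead of the master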
Code example #6
 def start_tensorboard_service(self):
     self._create_tensorboard_service()
     logger.info("Waiting for the URL for TensorBoard service...")
     tb_url = self._get_tensorboard_url()
     if tb_url:
         logger.info("TensorBoard service is available at: %s" % tb_url)
     else:
         logger.warning("Unable to get the URL for TensorBoard service")
Code example #7
 def patch_labels_to_pod(self, pod_name, labels_dict):
     body = {"metadata": {"labels": labels_dict}}
     try:
         return self.client.patch_namespaced_pod(name=pod_name,
                                                 namespace=self.namespace,
                                                 body=body)
     except client.api_client.ApiException as e:
         logger.warning("Exception when patching labels to pod: %s\n" % e)
         return None
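
A hedged usage sketch: the pod name and label values are illustrative only,
and `k8s_client` is an existing instance of the surrounding client class.

resp = k8s_client.patch_labels_to_pod(
    pod_name="elasticdl-worker-0",
    labels_dict={"status": "finished"},
)
if resp is None:
    # The patch failed, e.g. the pod was already deleted.
    logger.warning("Could not patch labels to pod elasticdl-worker-0")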
Code example #8
 def get_worker_pod(self, worker_id):
     try:
         return self.client.read_namespaced_pod(
             name=self.get_worker_pod_name(worker_id),
             namespace=self.namespace,
         )
     except client.api_client.ApiException as e:
         logger.warning("Exception when reading worker pod: %s\n" % e)
         return None
Code example #9
 def get_embedding_service_pod(self, embedding_service_id):
     try:
         return self.client.read_namespaced_pod(
             name=self.get_embedding_service_pod_name(embedding_service_id),
             namespace=self.namespace,
         )
     except client.api_client.ApiException as e:
         logger.warning(
             "Exception when reading embedding service pod: %s\n" % e)
         return None
Code example #10
File: worker.py Project: sorrycc/elasticdl
 def report_prediction_outputs(self, predictions):
     if self._prediction_outputs_processor:
         self._prediction_outputs_processor.process(predictions,
                                                    self._worker_id)
     else:
         logger.warning(
             "prediction_outputs_processor is not "
             "defined in the model definition. Prediction outputs "
             "are not processed.")
     return True
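
A hedged sketch of a processor that a model definition could provide so that
the branch above takes the non-warning path. The base class and the
`process(predictions, worker_id)` signature follow the usage shown in this
example and in example #13; the class body and import are illustrative.

# Assumes BasePredictionOutputsProcessor is importable from the ElasticDL API.
class PrintingOutputsProcessor(BasePredictionOutputsProcessor):
    def process(self, predictions, worker_id):
        # Log each prediction together with the worker that reported it.
        for p in predictions:
            print("worker %s predicted %s" % (worker_id, p))

# Exposed in the model definition module so the worker can pick it up.
prediction_outputs_processor = PrintingOutputsProcessor()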
Code example #11
 def _get_tensorboard_service(self):
     try:
         return self._k8s_client.client.read_namespaced_service(
             name=self._get_tensorboard_service_name(),
             namespace=self._k8s_client.namespace,
         ).to_dict()
     except client.api_client.ApiException as e:
         logger.warning("Exception when reading TensorBoard service: %s\n" %
                        e)
         return None
Code example #12
File: args.py Project: awesome-ml/elasticdl
def parse_master_args(master_args=None):
    parser = argparse.ArgumentParser(description="ElasticDL Master")
    parser.add_argument(
        "--port",
        default=50001,
        type=pos_int,
        help="The listening port of master",
    )
    parser.add_argument("--worker_image",
                        help="Docker image for workers",
                        default=None)
    parser.add_argument("--worker_pod_priority",
                        help="Priority requested by workers")
    parser.add_argument(
        "--prediction_data_dir",
        help="Prediction data directory. Files should be in RecordIO format",
        default="",
    )
    add_common_params(parser)
    add_train_params(parser)

    args, _ = parser.parse_known_args(args=master_args)
    print_args(args, groups=ALL_ARGS_GROUPS)
    logger.warning("Unknown arguments: %s", _)

    if all(v == "" or v is None for v in [
            args.training_data_dir,
            args.evaluation_data_dir,
            args.prediction_data_dir,
    ]):
        raise ValueError(
            "At least one of the data directories needs to be provided")

    if args.prediction_data_dir and (args.training_data_dir
                                     or args.evaluation_data_dir):
        raise ValueError(
            "Running prediction together with training or evaluation "
            "is not supported")
    if args.prediction_data_dir and not args.checkpoint_filename_for_init:
        raise ValueError(
            "checkpoint_filename_for_init is required for running "
            "prediction job")

    return args
Code example #13
File: model_helper.py Project: wwjiang007/elasticdl
def get_model_spec(
    model_zoo,
    model_def,
    model_params,
    dataset_fn,
    loss,
    optimizer,
    eval_metrics_fn,
    prediction_outputs_processor,
):
    """Get the model spec items in a tuple.

    The model spec tuple contains the following items in order:

    * The model object instantiated with parameters specified
      in `model_params`,
    * The `dataset_fn`,
    * The `loss`,
    * The `optimizer`,
    * The `eval_metrics_fn`,
    * The `prediction_outputs_processor`. Note that a warning will be
      logged if it is not inherited from `BasePredictionOutputsProcessor`.
    """
    model_def_module_file = get_module_file_path(model_zoo, model_def)
    default_module = load_module(model_def_module_file).__dict__
    model = load_model_from_module(model_def, default_module, model_params)
    prediction_outputs_processor = _get_spec_value(
        prediction_outputs_processor, model_zoo, default_module)
    if prediction_outputs_processor and not isinstance(
            prediction_outputs_processor, BasePredictionOutputsProcessor):
        logger.warning("prediction_outputs_processor is not "
                       "inherited from BasePredictionOutputsProcessor. "
                       "Prediction outputs may not be processed correctly.")
    return (
        model,
        _get_spec_value(dataset_fn, model_zoo, default_module, required=True),
        _get_spec_value(loss, model_zoo, default_module, required=True),
        _get_spec_value(optimizer, model_zoo, default_module, required=True),
        _get_spec_value(eval_metrics_fn,
                        model_zoo,
                        default_module,
                        required=True),
        prediction_outputs_processor,
    )
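
A hedged usage sketch showing how the returned tuple is unpacked in the
documented order; all paths and names below are illustrative only:

(
    model,
    dataset_fn,
    loss,
    optimizer,
    eval_metrics_fn,
    prediction_outputs_processor,
) = get_model_spec(
    model_zoo="/model_zoo",
    model_def="my_module.custom_model",
    model_params="",
    dataset_fn="dataset_fn",
    loss="loss",
    optimizer="optimizer",
    eval_metrics_fn="eval_metrics_fn",
    prediction_outputs_processor="PredictionOutputsProcessor",
)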
Code example #14
File: args.py Project: awesome-ml/elasticdl
def parse_worker_args(worker_args=None):
    parser = argparse.ArgumentParser(description="ElasticDL Worker")
    add_common_args_between_master_and_worker(parser)
    parser.add_argument("--worker_id",
                        help="Id unique to the worker",
                        type=int,
                        required=True)
    parser.add_argument("--job_type", help="Job type", required=True)
    parser.add_argument("--master_addr", help="Master ip:port", required=True)
    parser.add_argument(
        "--embedding_service_endpoint",
        type=str,
        default="{}",
        help="The endpoint of embedding service, "
        "e.g. \"{'ip_0': [port_0,port_1]}\"",
    )

    args, _ = parser.parse_known_args(args=worker_args)
    print_args(args, groups=ALL_ARGS_GROUPS)
    logger.warning("Unknown arguments: %s", _)
    return args
Code example #15
 def _run_shell_command(self, command):
     retry_times = 0
     while retry_times <= Redis.MAX_COMMAND_RETRY_TIMES:
         if retry_times:
             logger.warning(
                 'Command: "%s" failed to run, retry times: %d.' %
                 (command, retry_times))
         redis_process = subprocess.Popen(
             [command],
             shell=True,
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
         )
         redis_process.wait()
         if not redis_process.returncode:
             break
         redis_process.kill()
         # Wait for retry
         time.sleep(1)
         retry_times += 1
     return redis_process.returncode
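
A hedged usage sketch: the command string is illustrative, and
`embedding_service` stands for an existing instance of the surrounding class.

rc = embedding_service._run_shell_command(
    "redis-server --port 6379 --daemonize yes")
if rc:
    # A non-zero return code means every retry attempt failed.
    logger.warning("Command failed with return code %d" % rc)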
Code example #16
    def report(self, task_id, success):
        """Report if the task is successful or not"""

        evaluation_task_completed = False
        with self._lock:
            _, task = self._doing.pop(task_id, (-1, None))
            if not task:
                logger.warning("Unknown task_id: %d" % task_id)
            elif not success:
                # TODO: keep count of retries.
                self._todo.append(task)
            elif (task.type == elasticdl_pb2.EVALUATION
                  and self._evaluation_service is not None):
                evaluation_task_completed = True
            else:
                logger.info(
                    "Task:%d completed, %d remaining tasks",
                    task_id,
                    len(self._todo) + len(self._doing),
                )
        if evaluation_task_completed:
            self._evaluation_service.complete_task()
Code example #17
File: odps_io.py Project: merlintang/elasticdl
    def read_batch(self, start, end, columns=None, max_retries=3):
        """
        Read ODPS table in chosen row range [ `start`, `end` ) with the
        specified columns `columns`.

        Args:
            start: The row index to start reading.
            end: The row index to end reading.
            columns: The list of columns to read.
            max_retries : The maximum number of retries in case of exceptions.

        Returns:
            A two-dimensional Python list with shape (end - start, len(columns))
        """
        retry_count = 0
        if columns is None:
            columns = self._odps_table.schema.names
        while retry_count < max_retries:
            try:
                batch_record = []
                with self._odps_table.open_reader(partition=self._partition,
                                                  reopen=True) as reader:
                    for record in reader.read(start=start,
                                              count=end - start,
                                              columns=columns):
                        batch_record.append(
                            [record[column] for column in columns])
                return batch_record
            except Exception as e:
                # Re-raise after the final attempt instead of silently
                # returning None.
                if retry_count >= max_retries - 1:
                    raise Exception(
                        "Exceeded maximum number of retries") from e
                logger.warning("ODPS read exception {} for {} in {}. "
                               "Retry count: {}".format(
                                   e, columns, self._table, retry_count))
                time.sleep(5)
                retry_count += 1
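
A hedged usage sketch: `reader` is an already constructed instance of the
surrounding ODPS reader class, and the column names are illustrative only.

rows = reader.read_batch(start=0, end=128, columns=["feature", "label"])
# Each returned row is a list with one entry per requested column.
assert all(len(row) == 2 for row in rows)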
Code example #18
    def ReportGradient(self, request, _):
        model_version_valid = self._validate_model_version(
            request.model_version
        )

        res = elasticdl_pb2.ReportGradientResponse()
        if not model_version_valid:
            logger.warning(
                "Task result for outdated version %d dropped",
                request.model_version,
            )
            res.accepted = False
            res.model_version = self._version
            return res

        # TODO: Update task queue with task_id
        with self._lock:
            tmp = {}
            indexed_grads = {}
            edl_embedding_gradients = {}
            # Do sanity check before accumulating gradients.
            for k, v in request.gradient.items():
                if k not in self._model:
                    if v.indices:
                        # grads of ElasticDL Embedding layer
                        # TODO: check arr.shape[1] = embedding_dim of this
                        # EdlEmbedding layer
                        arr = tensor_to_ndarray(v)
                        edl_embedding_gradients[k] = arr
                        continue
                    else:
                        raise ValueError(
                            "Gradient key: %s is not part of model" % k
                        )

                arr = tensor_to_ndarray(v)
                if isinstance(arr, tf.IndexedSlices):
                    if arr.values.shape[1] != self._model[k].numpy().shape[1]:
                        raise ValueError(
                            "Gradient key: %s has incompatible "
                            "indexed slice dimension %d, expected %d"
                            % (
                                k,
                                arr.values.shape[1],
                                self._model[k].numpy().shape[1],
                            )
                        )

                    max_index = tf.math.reduce_max(arr.indices).numpy()
                    if max_index >= self._model[k].numpy().shape[0]:
                        raise ValueError(
                            "Gradient key: %s has wrong indices %d, "
                            "out of range %d"
                            % (
                                k,
                                max_index,
                                self._model[k].numpy().shape[0] - 1,
                            )
                        )
                    indexed_grads[k] = arr
                else:
                    if arr.shape != self._model[k].numpy().shape:
                        raise ValueError(
                            "Gradient key: %s has incompatible dimension" % k
                        )
                    tmp[k] = arr

            # grads of ElasticDL Embedding layer
            for k, v in edl_embedding_gradients.items():
                if k in self._edl_embedding_gradients:
                    self._edl_embedding_gradients[k] = merge_indexed_slices(
                        self._edl_embedding_gradients[k], v
                    )
                else:
                    self._edl_embedding_gradients[k] = v

            # grads of Keras Embedding layer
            for k, v in indexed_grads.items():
                if k not in self._gradient_sum_indexed:
                    self._gradient_sum_indexed[k] = v
                else:
                    grads_s = self._gradient_sum_indexed[k]
                    self._gradient_sum_indexed[k] = merge_indexed_slices(
                        grads_s, v
                    )

            # other grads
            for k, v in tmp.items():
                if not self._use_async and k in self._gradient_sum:
                    self._gradient_sum[k] = self._gradient_sum[k] + v
                else:
                    self._gradient_sum[k] = v

            self._grad_n += 1
            if self._use_async or self._grad_n >= self._grad_to_wait:
                self._update_model()
                self._update_evaluation()
                self._update_checkpoint()

        res.accepted = True
        res.model_version = self._version
        return res
Code example #19
    def ReportGradient(self, request, _):
        model_version_valid = self._use_async or self._validate_model_version(
            request.model_version
        )

        res = elasticdl_pb2.ReportGradientResponse()
        if not model_version_valid:
            logger.warning(
                "Task result for outdated version %d dropped",
                request.model_version,
            )
            res.accepted = False
            res.model_version = self._version
            return res

        tmp = {}
        indexed_grads = {}
        edl_embedding_gradients = {}
        # Do sanity check before accumulating gradients.
        for k, v in request.gradient.items():
            if k not in self._model:
                if v.indices:
                    # grads of ElasticDL Embedding layer
                    # TODO: check arr.shape[1] = embedding_dim of this
                    # EdlEmbedding layer
                    arr = tensor_to_ndarray(v)
                    edl_embedding_gradients[k] = arr
                    continue
                else:
                    raise ValueError(
                        "Gradient key: %s is not part of model" % k
                    )

            arr = tensor_to_ndarray(v)
            if isinstance(arr, tf.IndexedSlices):
                if arr.values.shape[1] != self._model[k].numpy().shape[1]:
                    raise ValueError(
                        "Gradient key: %s has incompatible "
                        "indexed slice dimension %d, expected %d"
                        % (
                            k,
                            arr.values.shape[1],
                            self._model[k].numpy().shape[1],
                        )
                    )

                max_index = tf.math.reduce_max(arr.indices).numpy()
                if max_index >= self._model[k].numpy().shape[0]:
                    raise ValueError(
                        "Gradient key: %s has wrong indices %d, "
                        "out of range %d"
                        % (k, max_index, self._model[k].numpy().shape[0] - 1)
                    )
                indexed_grads[k] = arr
            else:
                if arr.shape != self._model[k].numpy().shape:
                    raise ValueError(
                        "Gradient key: %s has incompatible dimension" % k
                    )
                tmp[k] = arr

        if not self._use_async:
            self._lock.acquire()
        self._process_gradients(
            edl_embedding_gradients, indexed_grads, tmp, request.model_version
        )
        if not self._use_async:
            self._lock.release()

        res.accepted = True
        res.model_version = self._version
        return res
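
A behavior-preserving sketch of the conditional locking above, written with a
null context manager instead of explicit acquire/release calls; the values of
`lock` and `use_async` stand in for `self._lock` and `self._use_async`:

import contextlib
import threading

lock = threading.Lock()
use_async = False

# In synchronous mode the lock is held while gradients are processed;
# in asynchronous mode no lock is taken at all.
with (lock if not use_async else contextlib.nullcontext()):
    pass  # process the accumulated gradients here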
Code example #20
File: odps_io.py Project: yupbank/elasticdl
def _read_odps_one_shot(
    project,
    access_id,
    access_key,
    endpoint,
    table,
    partition,
    start,
    end,
    columns,
    max_retries=3,
):
    """
    Read ODPS table in chosen row range [ `start`, `end` ) with the specified
    columns `columns`.

    Args:
        project: The ODPS project.
        access_id: The ODPS user access ID.
        access_key: The ODPS user access key.
        endpoint: The ODPS cluster endpoint.
        table: The ODPS table name.
        partition: The ODPS table's partition. Default is `None` if the
            table is not partitioned.
        start: The row index to start reading.
        end: The row index to end reading.
        columns: The list of columns to read.
        max_retries : The maximum number of retries in case of exceptions.

    Returns: A two-dimensional Python list with shape (end - start, len(columns))
    """
    odps_table = ODPS(access_id, access_key, project, endpoint).get_table(
        table
    )

    retry_count = 0

    while retry_count < max_retries:
        try:
            batch_record = []
            with odps_table.open_reader(
                partition=partition, reopen=True
            ) as reader:
                for record in reader.read(
                    start=start, count=end - start, columns=columns
                ):
                    batch_record.append([record[column] for column in columns])

            return batch_record

        except Exception as e:
            import time

            if retry_count >= max_retries - 1:
                # Re-raise after the final attempt instead of silently
                # returning None.
                raise
            logger.warning(
                "ODPS read exception {} for {} in {}. Retry count: {}".format(
                    e, columns, table, retry_count
                )
            )
            time.sleep(5)
            retry_count += 1