def get_master_pod(self):
    try:
        return self.client.read_namespaced_pod(
            name=self.get_master_pod_name(), namespace=self.namespace
        )
    except client.api_client.ApiException as e:
        logger.warning("Exception when reading master pod: %s\n" % e)
        return None
def ReportTaskResult(self, request, _):
    if request.err_message:
        logger.warning("Worker reported error: " + request.err_message)
        self._task_d.report(request.task_id, False)
    else:
        self._task_d.report(request.task_id, True)
    return empty_pb2.Empty()
def _generate_lookup_keys(self, grads_and_vars):
    """Generate lookup keys from a list of (gradient, layer name) pairs.

    Arguments:
        grads_and_vars: A list of (gradient, layer name) pairs.

    Returns:
        A tuple of (`embedding_keys`, `slot_keys`, `embedding_key_index`,
        `slot_key_index`).
        `embedding_keys`: A list of keys for embedding vectors in kv store.
        `slot_keys`: A list of keys for slots in kv store.
        `embedding_key_index`: A python dictionary that records the
            position of embedding keys for the same layer, i.e. an item
            `{layer_name: (start, end)}` means `embedding_keys[start:end]`
            are keys for the same layer named `layer_name`.
        `slot_key_index`: A python dictionary that records the position of
            slot keys for the same layer and the same slot, i.e. an item
            `{layer_name: {slot_name: (start, end)}}` means
            `slot_keys[start:end]` are keys for the same layer named
            `layer_name` and the same slot named `slot_name`.
    """
    embed_keys = []
    embed_key_index = {}
    slot_keys = []
    slot_key_index = {}
    self._unique_ids_all_layers = {}

    # generate keys
    for it, (grad, layer_name) in enumerate(grads_and_vars):
        # de-duplicate gradient's indices
        unique_ids, indices = tf.unique(grad.indices)
        unique_ids = unique_ids.numpy()
        if layer_name in self._unique_ids_all_layers:
            # TODO: support grads_and_vars with duplicated layer name
            logger.warning(
                "grads_and_vars has duplicated layer name %s." % layer_name
            )
        self._unique_ids_all_layers[layer_name] = unique_ids
        grad_new = tf.IndexedSlices(grad.values, indices)
        grads_and_vars[it] = (grad_new, layer_name)

        # generate embedding keys
        start = len(embed_keys)
        embed_keys.extend(
            [Embedding.get_key([layer_name, i]) for i in unique_ids]
        )
        end = len(embed_keys)
        embed_key_index[layer_name] = (start, end)

        # generate slot keys
        for slot in self._allowed_slot_names:
            start = len(slot_keys)
            slot_keys.extend(
                [Embedding.get_key([layer_name, slot, i]) for i in unique_ids]
            )
            end = len(slot_keys)
            slot_key_index.setdefault(layer_name, {}).setdefault(
                slot, (start, end)
            )
    return embed_keys, slot_keys, embed_key_index, slot_key_index
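# Illustrative sketch of the returned key layout (the exact string format
# produced by Embedding.get_key is an assumption; it may differ). For one
# layer "emb" with unique ids [3, 7] and allowed slot "momentum":
#   embed_keys      -> ["emb-3", "emb-7"]
#   embed_key_index -> {"emb": (0, 2)}
#   slot_keys       -> ["emb-momentum-3", "emb-momentum-7"]
#   slot_key_index  -> {"emb": {"momentum": (0, 2)}}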
def _run_model_call_before_training(self, features):
    """Call `self._model.call` before training for two things:

    * Create variables and report to ps if not created.
    * Check whether there is an embedding layer that is called
      more than once during one forward-pass.
    """
    if self._embedding_layers:
        with tf.GradientTape() as tape:
            self._set_tape_for_embedding(tape)
            _ = self._model.call(features)
    else:
        _ = self._model.call(features)

    self._non_embed_vars = get_non_embedding_trainable_vars(
        self._model, self._embedding_layers
    )
    if not self._var_created:
        self.report_variable()
        self._var_created = True

    if self._need_embedding_layer_check:
        self._train_eagerly = False
        for layer in self._embedding_layers:
            if len(layer.embedding_and_ids) > 1:
                self._train_eagerly = True
                logger.warning(
                    "ElasticDL embedding layer %s is called more than "
                    "once, this will make the training process unable "
                    "to accelerate with tf.function." % (layer.name)
                )
        self._need_embedding_layer_check = False

    self._reset_embedding()
def _validate_model_version(self, request_model_version):
    if request_model_version > self._version:
        err_msg = (
            "Model version %d not available yet, current version: %d"
            % (request_model_version, self._version)
        )
        logger.warning(err_msg)
        raise ValueError(err_msg)
    return request_model_version == self._version
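# Minimal usage sketch (assumed wiring, with self._version == 10):
# requesting a future version raises, the current version returns True,
# and an older version returns False so stale results can be dropped.
#   servicer._validate_model_version(10)  # -> True
#   servicer._validate_model_version(8)   # -> False
#   servicer._validate_model_version(12)  # -> raises ValueError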
def start_tensorboard_service(self):
    self._create_tensorboard_service()
    logger.info("Waiting for the URL for TensorBoard service...")
    tb_url = self._get_tensorboard_url()
    if tb_url:
        logger.info("TensorBoard service is available at: %s" % tb_url)
    else:
        logger.warning("Unable to get the URL for TensorBoard service")
def patch_labels_to_pod(self, pod_name, labels_dict):
    body = {"metadata": {"labels": labels_dict}}
    try:
        return self.client.patch_namespaced_pod(
            name=pod_name, namespace=self.namespace, body=body
        )
    except client.api_client.ApiException as e:
        logger.warning("Exception when patching labels to pod: %s\n" % e)
        return None
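# Hypothetical usage (pod name and label values are examples only):
#   k8s_client.patch_labels_to_pod(
#       pod_name="elasticdl-worker-0",
#       labels_dict={"status": "finished"},
#   )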
def get_worker_pod(self, worker_id):
    try:
        return self.client.read_namespaced_pod(
            name=self.get_worker_pod_name(worker_id),
            namespace=self.namespace,
        )
    except client.api_client.ApiException as e:
        logger.warning("Exception when reading worker pod: %s\n" % e)
        return None
def get_embedding_service_pod(self, embedding_service_id):
    try:
        return self.client.read_namespaced_pod(
            name=self.get_embedding_service_pod_name(embedding_service_id),
            namespace=self.namespace,
        )
    except client.api_client.ApiException as e:
        logger.warning(
            "Exception when reading embedding service pod: %s\n" % e
        )
        return None
def report_prediction_outputs(self, predictions):
    if self._prediction_outputs_processor:
        self._prediction_outputs_processor.process(
            predictions, self._worker_id
        )
    else:
        logger.warning(
            "prediction_outputs_processor is not "
            "defined in the model definition. Prediction outputs "
            "are not processed."
        )
    return True
def _get_tensorboard_service(self):
    try:
        return self._k8s_client.client.read_namespaced_service(
            name=self._get_tensorboard_service_name(),
            namespace=self._k8s_client.namespace,
        ).to_dict()
    except client.api_client.ApiException as e:
        logger.warning(
            "Exception when reading TensorBoard service: %s\n" % e
        )
        return None
def parse_master_args(master_args=None):
    parser = argparse.ArgumentParser(description="ElasticDL Master")
    parser.add_argument(
        "--port",
        default=50001,
        type=pos_int,
        help="The listening port of master",
    )
    parser.add_argument(
        "--worker_image", help="Docker image for workers", default=None
    )
    parser.add_argument(
        "--worker_pod_priority", help="Priority requested by workers"
    )
    parser.add_argument(
        "--prediction_data_dir",
        help="Prediction data directory. Files should be in RecordIO format",
        default="",
    )
    add_common_params(parser)
    add_train_params(parser)

    args, unknown_args = parser.parse_known_args(args=master_args)
    print_args(args, groups=ALL_ARGS_GROUPS)
    if unknown_args:
        logger.warning("Unknown arguments: %s", unknown_args)

    if all(
        v == "" or v is None
        for v in [
            args.training_data_dir,
            args.evaluation_data_dir,
            args.prediction_data_dir,
        ]
    ):
        raise ValueError(
            "At least one of the data directories needs to be provided"
        )

    if args.prediction_data_dir and (
        args.training_data_dir or args.evaluation_data_dir
    ):
        raise ValueError(
            "Running prediction together with training or evaluation "
            "is not supported"
        )
    if args.prediction_data_dir and not args.checkpoint_filename_for_init:
        raise ValueError(
            "checkpoint_filename_for_init is required for running "
            "prediction job"
        )

    return args
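# Hypothetical invocation (flag values are examples only; the data
# directory flags come from add_common_params/add_train_params):
#   args = parse_master_args(
#       ["--port", "50001", "--training_data_dir", "/data/train"]
#   )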
def get_model_spec(
    model_zoo,
    model_def,
    model_params,
    dataset_fn,
    loss,
    optimizer,
    eval_metrics_fn,
    prediction_outputs_processor,
):
    """Get the model spec items in a tuple.

    The model spec tuple contains the following items in order:

    * The model object instantiated with parameters specified in
      `model_params`,
    * The `dataset_fn`,
    * The `loss`,
    * The `optimizer`,
    * The `eval_metrics_fn`,
    * The `prediction_outputs_processor`. Note that it will print a
      warning if it's not inherited from `BasePredictionOutputsProcessor`.
    """
    model_def_module_file = get_module_file_path(model_zoo, model_def)
    default_module = load_module(model_def_module_file).__dict__
    model = load_model_from_module(model_def, default_module, model_params)
    prediction_outputs_processor = _get_spec_value(
        prediction_outputs_processor, model_zoo, default_module
    )
    if prediction_outputs_processor and not isinstance(
        prediction_outputs_processor, BasePredictionOutputsProcessor
    ):
        logger.warning(
            "prediction_outputs_processor is not "
            "inherited from BasePredictionOutputsProcessor. "
            "Prediction outputs may not be processed correctly."
        )
    return (
        model,
        _get_spec_value(dataset_fn, model_zoo, default_module, required=True),
        _get_spec_value(loss, model_zoo, default_module, required=True),
        _get_spec_value(optimizer, model_zoo, default_module, required=True),
        _get_spec_value(
            eval_metrics_fn, model_zoo, default_module, required=True
        ),
        prediction_outputs_processor,
    )
def parse_worker_args(worker_args=None):
    parser = argparse.ArgumentParser(description="ElasticDL Worker")
    add_common_args_between_master_and_worker(parser)
    parser.add_argument(
        "--worker_id", help="Id unique to the worker", type=int, required=True
    )
    parser.add_argument("--job_type", help="Job type", required=True)
    parser.add_argument("--master_addr", help="Master ip:port", required=True)
    parser.add_argument(
        "--embedding_service_endpoint",
        type=str,
        default="{}",
        help="The endpoint of embedding service, "
        "e.g. \"{'ip_0': [port_0, port_1]}\"",
    )

    args, unknown_args = parser.parse_known_args(args=worker_args)
    print_args(args, groups=ALL_ARGS_GROUPS)
    if unknown_args:
        logger.warning("Unknown arguments: %s", unknown_args)
    return args
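# Hypothetical invocation (values are examples only):
#   args = parse_worker_args(
#       [
#           "--worker_id", "0",
#           "--job_type", "training",
#           "--master_addr", "127.0.0.1:50001",
#       ]
#   )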
def _run_shell_command(self, command):
    retry_times = 0
    while retry_times <= Redis.MAX_COMMAND_RETRY_TIMES:
        if retry_times:
            logger.warning(
                'Command: "%s" failed to run, retry times: %d.'
                % (command, retry_times)
            )
        redis_process = subprocess.Popen(
            [command],
            shell=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        redis_process.wait()
        if not redis_process.returncode:
            break
        redis_process.kill()
        # Wait before retrying
        time.sleep(1)
        retry_times += 1
    return redis_process.returncode
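# Hypothetical usage sketch; the real command string is built by the
# Redis startup logic elsewhere in this class:
#   returncode = self._run_shell_command(
#       "redis-server --port 6379"  # example command only
#   )
#   if returncode:
#       logger.warning("Command failed after all retries")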
def report(self, task_id, success):
    """Report whether the task succeeded or not."""
    evaluation_task_completed = False
    with self._lock:
        _, task = self._doing.pop(task_id, (-1, None))
        if not task:
            logger.warning("Unknown task_id: %d" % task_id)
        elif not success:
            # TODO: keep count of retries.
            self._todo.append(task)
        elif (
            task.type == elasticdl_pb2.EVALUATION
            and self._evaluation_service is not None
        ):
            evaluation_task_completed = True
        else:
            logger.info(
                "Task:%d completed, %d remaining tasks",
                task_id,
                len(self._todo) + len(self._doing),
            )
    if evaluation_task_completed:
        self._evaluation_service.complete_task()
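# Usage sketch (task ids are examples; dispatcher wiring assumed):
#   task_d.report(42, False)  # failed task goes back to self._todo
#   task_d.report(43, True)   # completed task is removed from self._doing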
def read_batch(self, start, end, columns=None, max_retries=3):
    """
    Read an ODPS table in the chosen row range [`start`, `end`) with
    the specified columns `columns`.

    Args:
        start: The row index at which to start reading.
        end: The row index at which to stop reading (exclusive).
        columns: The list of columns to read.
        max_retries: The maximum number of retries in case of exceptions.

    Returns:
        A two-dimensional python list with shape:
        (end - start, len(columns)).
    """
    retry_count = 0
    if columns is None:
        columns = self._odps_table.schema.names
    while retry_count < max_retries:
        try:
            batch_record = []
            with self._odps_table.open_reader(
                partition=self._partition, reopen=True
            ) as reader:
                for record in reader.read(
                    start=start, count=end - start, columns=columns
                ):
                    batch_record.append(
                        [record[column] for column in columns]
                    )
            return batch_record
        except Exception as e:
            logger.warning(
                "ODPS read exception {} for {} in {}. "
                "Retry time: {}".format(e, columns, self._table, retry_count)
            )
            # Raise once all retries are exhausted; otherwise the loop
            # would exit and silently return None.
            if retry_count >= max_retries - 1:
                raise Exception("Exceeded maximum number of retries")
            time.sleep(5)
            retry_count += 1
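# Usage sketch (the reader object and column names are hypothetical):
#   rows = reader.read_batch(start=0, end=128, columns=["feature", "label"])
#   # -> at most 128 rows, each a list of len(columns) values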
def ReportGradient(self, request, _):
    model_version_valid = self._validate_model_version(
        request.model_version
    )

    res = elasticdl_pb2.ReportGradientResponse()
    if not model_version_valid:
        logger.warning(
            "Task result for outdated version %d dropped",
            request.model_version,
        )
        res.accepted = False
        res.model_version = self._version
        return res

    # TODO: Update task queue with task_id
    with self._lock:
        tmp = {}
        indexed_grads = {}
        edl_embedding_gradients = {}
        # Do sanity check before accumulating gradients.
        for k, v in request.gradient.items():
            if k not in self._model:
                if v.indices:
                    # grads of ElasticDL Embedding layer
                    # TODO: check arr.shape[1] = embedding_dim of this
                    # EdlEmbedding layer
                    arr = tensor_to_ndarray(v)
                    edl_embedding_gradients[k] = arr
                    continue
                else:
                    raise ValueError(
                        "Gradient key: %s is not part of model" % k
                    )

            arr = tensor_to_ndarray(v)
            if isinstance(arr, tf.IndexedSlices):
                if arr.values.shape[1] != self._model[k].numpy().shape[1]:
                    raise ValueError(
                        "Gradient key: %s has incompatible "
                        "indexed slice dimension %d, expected %d"
                        % (
                            k,
                            arr.values.shape[1],
                            self._model[k].numpy().shape[1],
                        )
                    )

                max_index = tf.math.reduce_max(arr.indices).numpy()
                if max_index >= self._model[k].numpy().shape[0]:
                    raise ValueError(
                        "Gradient key: %s has wrong indices %d, "
                        "out of range %d"
                        % (
                            k,
                            max_index,
                            self._model[k].numpy().shape[0] - 1,
                        )
                    )
                indexed_grads[k] = arr
            else:
                if arr.shape != self._model[k].numpy().shape:
                    raise ValueError(
                        "Gradient key: %s has incompatible dimension" % k
                    )
                tmp[k] = arr

        # grads of ElasticDL Embedding layer
        for k, v in edl_embedding_gradients.items():
            if k in self._edl_embedding_gradients:
                self._edl_embedding_gradients[k] = merge_indexed_slices(
                    self._edl_embedding_gradients[k], v
                )
            else:
                self._edl_embedding_gradients[k] = v

        # grads of Keras Embedding layer
        for k, v in indexed_grads.items():
            if k not in self._gradient_sum_indexed:
                self._gradient_sum_indexed[k] = v
            else:
                grads_s = self._gradient_sum_indexed[k]
                self._gradient_sum_indexed[k] = merge_indexed_slices(
                    grads_s, v
                )

        # other grads
        for k, v in tmp.items():
            if not self._use_async and k in self._gradient_sum:
                self._gradient_sum[k] = self._gradient_sum[k] + v
            else:
                self._gradient_sum[k] = v

        self._grad_n += 1
        if self._use_async or self._grad_n >= self._grad_to_wait:
            self._update_model()
            self._update_evaluation()
            self._update_checkpoint()

    res.accepted = True
    res.model_version = self._version
    return res
def ReportGradient(self, request, _):
    model_version_valid = self._use_async or self._validate_model_version(
        request.model_version
    )

    res = elasticdl_pb2.ReportGradientResponse()
    if not model_version_valid:
        logger.warning(
            "Task result for outdated version %d dropped",
            request.model_version,
        )
        res.accepted = False
        res.model_version = self._version
        return res

    tmp = {}
    indexed_grads = {}
    edl_embedding_gradients = {}
    # Do sanity check before accumulating gradients.
    for k, v in request.gradient.items():
        if k not in self._model:
            if v.indices:
                # grads of ElasticDL Embedding layer
                # TODO: check arr.shape[1] = embedding_dim of this
                # EdlEmbedding layer
                arr = tensor_to_ndarray(v)
                edl_embedding_gradients[k] = arr
                continue
            else:
                raise ValueError(
                    "Gradient key: %s is not part of model" % k
                )

        arr = tensor_to_ndarray(v)
        if isinstance(arr, tf.IndexedSlices):
            if arr.values.shape[1] != self._model[k].numpy().shape[1]:
                raise ValueError(
                    "Gradient key: %s has incompatible "
                    "indexed slice dimension %d, expected %d"
                    % (
                        k,
                        arr.values.shape[1],
                        self._model[k].numpy().shape[1],
                    )
                )

            max_index = tf.math.reduce_max(arr.indices).numpy()
            if max_index >= self._model[k].numpy().shape[0]:
                raise ValueError(
                    "Gradient key: %s has wrong indices %d, "
                    "out of range %d"
                    % (k, max_index, self._model[k].numpy().shape[0] - 1)
                )
            indexed_grads[k] = arr
        else:
            if arr.shape != self._model[k].numpy().shape:
                raise ValueError(
                    "Gradient key: %s has incompatible dimension" % k
                )
            tmp[k] = arr

    if not self._use_async:
        self._lock.acquire()
    self._process_gradients(
        edl_embedding_gradients, indexed_grads, tmp, request.model_version
    )
    if not self._use_async:
        self._lock.release()

    res.accepted = True
    res.model_version = self._version
    return res
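# A plausible sketch of the merge_indexed_slices helper used above (an
# assumption, not necessarily the actual ElasticDL implementation): two
# tf.IndexedSlices are merged by concatenating values and indices, and
# duplicated indices get summed later when the slices are applied to the
# dense variable.
import tensorflow as tf

def merge_indexed_slices(a, b):
    # Concatenate the gradient rows and their row indices.
    return tf.IndexedSlices(
        values=tf.concat([a.values, b.values], axis=0),
        indices=tf.concat([a.indices, b.indices], axis=0),
    )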
def _read_odps_one_shot(
    project,
    access_id,
    access_key,
    endpoint,
    table,
    partition,
    start,
    end,
    columns,
    max_retries=3,
):
    """
    Read an ODPS table in the chosen row range [`start`, `end`) with
    the specified columns `columns`.

    Args:
        project: The ODPS project.
        access_id: The ODPS user access ID.
        access_key: The ODPS user access key.
        endpoint: The ODPS cluster endpoint.
        table: The ODPS table name.
        partition: The ODPS table's partition. `None` if the table is
            not partitioned.
        start: The row index at which to start reading.
        end: The row index at which to stop reading (exclusive).
        columns: The list of columns to read.
        max_retries: The maximum number of retries in case of exceptions.

    Returns:
        A two-dimensional python list with shape:
        (end - start, len(columns)).
    """
    import time

    odps_table = ODPS(access_id, access_key, project, endpoint).get_table(
        table
    )

    retry_count = 0
    while retry_count < max_retries:
        try:
            batch_record = []
            with odps_table.open_reader(
                partition=partition, reopen=True
            ) as reader:
                for record in reader.read(
                    start=start, count=end - start, columns=columns
                ):
                    batch_record.append(
                        [record[column] for column in columns]
                    )
            return batch_record
        except Exception as e:
            logger.warning(
                "ODPS read exception {} for {} in {}. "
                "Retry time: {}".format(e, columns, table, retry_count)
            )
            # Re-raise once all retries are exhausted; otherwise the loop
            # would exit and silently return None.
            if retry_count >= max_retries - 1:
                raise
            time.sleep(5)
            retry_count += 1
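# Hypothetical call (credentials, endpoint, and names are placeholders):
#   records = _read_odps_one_shot(
#       project="my_project",
#       access_id="my_access_id",
#       access_key="my_access_key",
#       endpoint="http://service.odps.example.com/api",
#       table="my_table",
#       partition=None,
#       start=0,
#       end=100,
#       columns=["feature", "label"],
#   )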