def _init_embedding_layer(self):
    """Collect the elasticdl.layers.embedding layers contained in the
    model and point every one of them at the embedding service endpoint.
    """
    layers = find_layer(self._model, Embedding)
    self._embedding_layers = layers
    endpoint = self._embedding_service_endpoint
    for embedding_layer in layers:
        embedding_layer.set_endpoint(endpoint)
def _test_correctness(self, optimizer_class, X, Y, seed, **kwargs):
    """Test the correctness of specific TensorFlow optimizer.

    Trains a plain Keras embedding model with a native TensorFlow
    optimizer, then trains the ElasticDL embedding model with the same
    optimizer wrapped in ``OptimizerWrapper`` (embedding vectors served
    from a mocked kv store), and asserts that both end up with
    numerically equal parameters.

    Arguments:
        optimizer_class: a TensorFlow optimizer class.
        X: training features.
        Y: training labels.
        seed: random seed used for weight init and training determinism.
        **kwargs: forwarded to ``optimizer_class``.
    """
    _model_file = get_module_file_path(
        os.path.dirname(os.path.realpath(__file__)),
        "embedding_test_module.KerasEmbeddingModel",
    )
    model_module = load_module(_model_file).__dict__

    # train model with TensorFlow optimizer
    weights = self._random_init_model_weight(
        [(4, 4), (4, 4), (72, 1), (1,)], seed
    )
    loss_fn = model_module["loss"]
    model1 = model_module["KerasEmbeddingModel"](4, 4, weights)
    opt1 = optimizer_class(**kwargs)
    _train(model1, opt1, X, Y, loss_fn, random_seed=seed)

    model2 = model_module["EdlEmbeddingModel"](4, weights[2:])
    opt2 = optimizer_class(**kwargs)

    layer_names = [layer.name for layer in find_layer(model2, Embedding)]
    embed_dims = dict([(layer_name, 4) for layer_name in layer_names])

    # initialize embedding vectors in kv store
    mock_kv_store = MockKvStore({})
    for layer, embed_table in zip(layer_names, weights[:2]):
        for i, embed_vector in enumerate(embed_table):
            mock_kv_store.update(["%s-%d" % (layer, i)], [embed_vector])

    # train model with optimizer wrapper
    with mock.patch.object(
        EmbeddingService, "lookup_embedding", mock_kv_store.lookup
    ), mock.patch.object(
        EmbeddingService, "update_embedding", mock_kv_store.update
    ):
        _train_edl_embedding_with_optimizer_wrapper(
            model2, opt2, X, Y, loss_fn, embed_dims, random_seed=seed
        )

    # compare trained parameters
    wrong_msg = (
        "The updated parameters of Optimizer Wrapper and TensorFlow "
        "optimizer %s differ." % opt1.get_config()["name"]
    )
    for layer1, layer2 in zip(model1.layers, model2.layers):
        if "embedding" in layer2.name:
            w1 = layer1.weights[0].numpy()
            keys = [Embedding.get_key([layer2.name, i]) for i in range(4)]
            w2 = np.concatenate(mock_kv_store.lookup(keys)[0]).reshape(
                4, -1
            )
            # Fix: compare the ABSOLUTE difference. The original
            # one-sided check `(w1 - w2 < 0.0001)` passes even when w1
            # is arbitrarily smaller than w2, so real divergence could
            # go undetected.
            self.assertTrue(
                (abs(w1 - w2) < 0.0001).all(), msg=wrong_msg
            )
        else:
            for w1, w2 in zip(layer1.weights, layer2.weights):
                # Same one-sided-tolerance fix as above; builtin abs()
                # works on both numpy arrays and TF tensors.
                self.assertTrue(
                    (abs(w1 - w2) < 0.0001).numpy().all(), msg=wrong_msg
                )
def _train_edl_embedding_with_optimizer_wrapper(
    model, opt_keras, X, Y, loss_fn, embed_dims, random_seed
):
    """Train model with optimizer wrapper.

    Runs a manual training loop over (X, Y) minibatches: computes the
    gradients of both non-embedding variables and the per-batch embedding
    tensors (BETs) produced by elasticdl Embedding layers, merges the
    embedding gradients per layer into a single tf.IndexedSlices, and
    applies everything through OptimizerWrapper.

    Arguments:
        model: a Keras model containing elasticdl.layers.Embedding layers.
        opt_keras: the TensorFlow optimizer to wrap.
        X: iterable of feature minibatches.
        Y: iterable of label minibatches.
        loss_fn: callable taking (outputs, labels) and returning the loss.
        embed_dims: dict of embedding layer name to its output dimension.
        random_seed: seed passed to tf.random.set_seed for determinism.
    """
    tf.random.set_seed(random_seed)
    optimizer = OptimizerWrapper(opt_keras, None, embed_dims)

    # initialization process related to embedding layer and optimizer wrapper
    embed_layers = find_layer(model, Embedding)

    # training process
    for train_iter, (features, labels) in enumerate(zip(X, Y)):
        with tf.GradientTape() as tape:
            # The tape must be handed to each embedding layer BEFORE the
            # forward pass so the layers can watch their batch embedding
            # tensors during `model.call`.
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # Need to get non-embedding variables inside for loop because model
        # creates variables after the first time `model.call` is called
        if not train_iter:
            non_embed_vars = get_non_embedding_trainable_vars(
                model, embed_layers)
        # One (batch-embedding-tensor, layer name, ids) triple per lookup
        # performed by each embedding layer in this iteration.
        embed_items = []
        for layer in embed_layers:
            embed_items.extend([(bet, layer.name, ids)
                                for bet, ids in layer.bet_ids_pair])

        # Gradients come back in the same order as the variable list:
        # non-embedding vars first, then the embedding tensors.
        grads = tape.gradient(
            loss, non_embed_vars + [var for var, _, _ in embed_items])

        # TODO: do not need to merge gradient from the same embedding layer
        # after `optimizer_wrapper` support grads_and_vars with duplicated
        # layer name
        non_embed_vars_n = len(non_embed_vars)
        non_embed_grads = grads[:non_embed_vars_n]
        embed_grads_dict = {}
        for (_, layer_name, ids), grad in zip(
                embed_items, grads[non_embed_vars_n:]):
            if layer_name in embed_grads_dict:
                # Merge multiple sparse gradients for the same layer by
                # concatenating values and indices.
                merged_grads = embed_grads_dict[layer_name]
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([merged_grads.values, grad.values], axis=0),
                    tf.concat([merged_grads.indices, ids], axis=0),
                )
            else:
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids)

        # NOTE(review): embedding grads are paired with the layer NAME
        # (a string) rather than a tf.Variable — presumably part of the
        # OptimizerWrapper contract; confirm against its implementation.
        optimizer.apply_gradients(
            list(zip(non_embed_grads, non_embed_vars))
            + [(grad, layer_name)
               for layer_name, grad in embed_grads_dict.items()])

        # Clear the per-batch state cached by each embedding layer.
        for layer in embed_layers:
            layer.reset()
def _init_embedding_layer(self):
    """Collect the elasticdl.layers.embedding layers in the model and
    wire each one to this worker's embedding lookup function.
    """
    layers = find_layer(self._model, Embedding)
    self._embedding_layers = layers
    for embedding_layer in layers:
        embedding_layer.set_lookup_func(self.lookup_embedding)
    if layers:
        # TODO check that Redis IP/PORT is set
        pass
def test_find_layer_nested(self):
    """find_layer must recurse into a subclassed (nested) model and
    count layers of each requested class correctly.
    """
    model_def = "resnet50_subclass.resnet50_subclass.CustomModel"
    model = _create_model_instance(model_def)
    expected_counts = {
        tf.keras.layers.Conv2D: 53,
        tf.keras.layers.Activation: 50,
        tf.keras.layers.Embedding: 0,
    }
    for layer_class, expected in expected_counts.items():
        found = find_layer(model, layer_class)
        self.assertEqual(expected, len(found))
def test_find_layer(self):
    """find_layer must locate layers of a given class in a functional-API
    model and return an empty result for classes that are absent.
    """
    model_def = "mnist_functional_api.mnist_functional_api.custom_model"
    model = _create_model_instance(model_def)
    expected_counts = {
        tf.keras.layers.Conv2D: 2,
        tf.keras.layers.Dropout: 1,
        tf.keras.layers.Embedding: 0,
    }
    for layer_class, expected in expected_counts.items():
        found = find_layer(model, layer_class)
        self.assertEqual(expected, len(found))
def main():
    """Entry point of the ElasticDL master process.

    Parses CLI arguments, then in order: optionally starts a TensorBoard
    service, builds the task dispatcher, loads the user model module,
    determines the job type, starts checkpoint/evaluation/embedding
    services as needed, serves the master gRPC endpoint, launches worker
    pods, and finally blocks until all tasks finish (or Ctrl-C).
    """
    args = parse_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        args.records_per_task,
        args.num_epochs,
    )

    # Load the user-provided model module and instantiate model/optimizer.
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)).__dict__
    model_inst = load_model_from_module(
        args.model_def, model_module, args.model_params)
    optimizer = model_module[args.optimizer]()

    # Derive the job type from which data directories were supplied.
    if all((
        args.training_data_dir,
        args.evaluation_data_dir,
        args.evaluation_throttle_secs or args.evaluation_steps,
    )):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all((
        args.evaluation_data_dir,
        not args.training_data_dir,
        not args.prediction_data_dir,
    )):
        job_type = JobType.EVALUATION_ONLY
    elif all((
        args.prediction_data_dir,
        not args.evaluation_data_dir,
        not args.training_data_dir,
    )):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (job_type == JobType.TRAINING_WITH_EVALUATION
            or job_type == JobType.EVALUATION_ONLY):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            " and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info("Embedding service start succeeded. The endpoint is %s."
                    % str(embedding_service_endpoint))
        embedding_dims = dict([(layer.name, layer.output_dim)
                               for layer in layers])

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        # Variables only exist once the model has been built; otherwise
        # pass an empty list and let workers report them later.
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    # Launch worker pods, forwarding the relevant CLI options to each one.
    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--model_zoo",
            args.model_zoo,
            "--master_addr",
            master_addr,
            "--log_level",
            args.log_level,
            "--dataset_fn",
            args.dataset_fn,
            "--loss",
            args.loss,
            "--optimizer",
            args.optimizer,
            "--eval_metrics_fn",
            args.eval_metrics_fn,
            "--model_def",
            args.model_def,
            "--job_type",
            job_type,
            "--minibatch_size",
            str(args.minibatch_size),
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
            "--get_model_steps",
            str(args.get_model_steps),
        ]

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    # Block until the task dispatcher reports completion, polling every
    # 30 seconds; Ctrl-C triggers a graceful shutdown below.
    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running...")
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning("Unable to keep TensorBoard running. "
                               "It has already terminated")
                break
    logger.info("Master stopped")
def _train_edl_embedding_with_optimizer_wrapper(
    model, opt_keras, X, Y, loss_fn, embed_dims, random_seed
):
    """Train model with optimizer wrapper.

    Wires the elasticdl Embedding layers to a lookup function backed by
    EmbeddingService, then runs a manual training loop: computes the
    gradients of both non-embedding variables and the per-batch embedding
    tensors, merges the embedding gradients per layer into a single
    tf.IndexedSlices, and applies them via OptimizerWrapper.

    Arguments:
        model: a Keras model containing elasticdl.layers.Embedding layers.
        opt_keras: the TensorFlow optimizer to wrap.
        X: iterable of feature minibatches.
        Y: iterable of label minibatches.
        loss_fn: callable taking (outputs, labels) and returning the loss.
        embed_dims: dict of embedding layer name to its output dimension.
        random_seed: seed passed to tf.random.set_seed for determinism.
    """
    tf.random.set_seed(random_seed)
    optimizer = OptimizerWrapper(opt_keras, None, embed_dims)

    # initialization process related to embedding layer and optimizer wrapper
    embed_layers = find_layer(model, Embedding)

    def lookup_func(ids, layer_name, initializer, output_dim):
        # Fetch embedding vectors for `ids` from the embedding service
        # and stack them into an (len(ids), output_dim) array.
        # NOTE(review): `unknown` (ids missing from the kv store) is
        # ignored here — presumably acceptable in this test path since
        # all vectors are pre-initialized; confirm before reuse.
        values, unknown = EmbeddingService.lookup_embedding(
            [Embedding.get_key([layer_name, i]) for i in ids]
        )
        return np.concatenate(values).reshape(len(ids), -1)

    for layer in embed_layers:
        layer.set_lookup_func(lookup_func)

    # training process
    for features, labels in zip(X, Y):
        with tf.GradientTape() as tape:
            # The tape must be handed to each embedding layer BEFORE the
            # forward pass so the layers can watch their batch embedding
            # tensors during `model.call`.
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # TODO: calculate train_vars_embed and train_vars_other can be a
        # reusable function
        train_vars_embed = []
        train_vars_other = []
        for layer in model.layers:
            if isinstance(layer, Embedding):
                for bet, ids in layer.bet_ids_pair:
                    train_vars_embed.append((bet, layer.name, ids))
            else:
                vars = layer.trainable_variables
                train_vars_other.extend(vars)

        # Gradients come back in the same order as the variable list:
        # non-embedding vars first, then the embedding tensors.
        grads = tape.gradient(
            loss, train_vars_other + [var for var, _, _ in train_vars_embed]
        )

        # TODO: do not need to merge gradient from the same embedding layer
        # after `optimizer_wrapper` support grads_and_vars with duplicated
        # layer name
        train_vars_other_len = len(train_vars_other)
        grads_new = grads[:train_vars_other_len]
        grads_embed_dict = {}
        for (_, layer_name, ids), grad in zip(
            train_vars_embed, grads[train_vars_other_len:]
        ):
            if layer_name in grads_embed_dict:
                # Merge multiple sparse gradients for the same layer by
                # concatenating values and indices.
                grads_merged = grads_embed_dict[layer_name]
                grads_embed_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([grads_merged.values, grad.values], axis=0),
                    tf.concat([grads_merged.indices, ids], axis=0),
                )
            else:
                grads_embed_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids
                )

        # NOTE(review): embedding grads are paired with the layer NAME
        # (a string) rather than a tf.Variable — presumably part of the
        # OptimizerWrapper contract; confirm against its implementation.
        optimizer.apply_gradients(
            list(zip(grads_new, train_vars_other))
            + [
                (grad, layer_name)
                for layer_name, grad in grads_embed_dict.items()
            ]
        )

        # Clear the per-batch state cached by each embedding layer.
        for layer in embed_layers:
            layer.reset()