def test_find_layer(self):
    """Check that find_layer counts layers by class in a functional-API model."""
    model_def = "mnist_functional_api.mnist_functional_api.custom_model"
    model = _create_model_instance(model_def)
    # Expected number of layers of each class in the MNIST model.
    expected_counts = {
        tf.keras.layers.Conv2D: 2,
        tf.keras.layers.Dropout: 1,
        tf.keras.layers.Embedding: 0,
    }
    for layer_class, expected in expected_counts.items():
        found = find_layer(model, layer_class)
        self.assertEqual(expected, len(found))
def test_find_layer_nested(self):
    """Check that find_layer descends into nested layers of a subclassed model."""
    model_def = "resnet50_subclass.resnet50_subclass.CustomModel"
    model = _create_model_instance(model_def)
    # Expected number of layers of each class in the ResNet50 model.
    expected_counts = {
        tf.keras.layers.Conv2D: 53,
        tf.keras.layers.Activation: 50,
        tf.keras.layers.Embedding: 0,
    }
    for layer_class, expected in expected_counts.items():
        found = find_layer(model, layer_class)
        self.assertEqual(expected, len(found))
def _test_correctness(self, optimizer_class, X, Y, seed, **opt_kwargs):
    """Test the correctness of specific TensorFlow optimizer.

    Trains two models on the same data with the same random seed: a plain
    Keras embedding model driven by ``optimizer_class`` directly, and an
    ElasticDL embedding model driven through the optimizer wrapper.
    Asserts that every trained parameter (embedding and non-embedding)
    agrees between the two models within ``np.isclose`` tolerance.

    Args:
        optimizer_class: TensorFlow optimizer class to instantiate.
        X: training features, consumed batch by batch.
        Y: training labels matching X.
        seed: random seed for weight init and training.
        **opt_kwargs: keyword arguments forwarded to optimizer_class.
    """
    _model_file = get_module_file_path(
        os.path.dirname(os.path.realpath(__file__)),
        "embedding_test_module.KerasEmbeddingModel",
    )
    model_module = load_module(_model_file).__dict__

    # train model with TensorFlow optimizer
    dim = 4
    # Two embedding tables of shape (4, dim), one dense kernel (72, 1)
    # and one bias (1,); all initialized deterministically from `seed`.
    weights = self._random_init_model_weight(
        [(4, dim), (4, dim), (72, 1), (1, )], seed
    )
    loss_fn = model_module["loss"]
    model1 = model_module["KerasEmbeddingModel"](4, dim, weights)
    opt1 = optimizer_class(**opt_kwargs)
    _train(model1, opt1, X, Y, loss_fn, random_seed=seed)

    # The ElasticDL model only takes the non-embedding weights; the
    # embedding tables are served from the Parameters object below.
    model2 = model_module["EdlEmbeddingModel"](dim, weights[2:])
    opt2 = optimizer_class(**opt_kwargs)

    layer_names = [layer.name for layer in find_layer(model2, Embedding)]

    # create Parameters object and initialize embedding vectors
    params = Parameters()
    for layer_name, embed_value in zip(layer_names, weights[:2]):
        embed_table = EmbeddingTable(layer_name, dim)
        embed_table.set(range(len(embed_value)), embed_value)
        params.embedding_params[layer_name] = embed_table

    _train_edl_embedding_with_optimizer_wrapper(
        model2, opt2, X, Y, loss_fn, params, random_seed=seed
    )

    # compare trained parameters
    wrong_msg = (
        "The updated parameters of Optimizer Wrapper and TensorFlow "
        "optimizer %s differ." % opt1.get_config()["name"]
    )

    for layer1, layer2 in zip(model1.layers, model2.layers):
        if "embedding" in layer2.name:
            # Embedding weights live in `params`, not in model2's layers.
            w1 = layer1.weights[0].numpy()
            w2 = params.get_embedding_param(layer2.name, range(4))
            self.assertTrue(np.isclose(w1, w2).all(), msg=wrong_msg)
        else:
            for w1, w2 in zip(layer1.weights, layer2.weights):
                self.assertTrue(
                    np.isclose(w1.numpy(), w2.numpy()).all(), msg=wrong_msg
                )
def main():
    """Entry point of the ElasticDL master process.

    Parses master arguments, then starts (in order): an optional
    TensorBoard service, the task dispatcher, optional checkpoint /
    evaluation / embedding services, the gRPC master service, and the
    worker pods. Blocks until all tasks finish or the process is
    interrupted, then shuts the services down.
    """
    args = parse_master_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    records_per_task = args.minibatch_size * args.num_minibatches_per_task
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    model_inst = load_model_from_module(
        args.model_def, model_module, args.model_params
    )
    optimizer = model_module[args.optimizer]()

    # Derive the job type from which data directories were provided.
    if all(
        (
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
        )
    ):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all(
        (
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
        )
    ):
        job_type = JobType.EVALUATION_ONLY
    elif all(
        (
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
        )
    ):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (
        job_type == JobType.TRAINING_WITH_EVALUATION
        or job_type == JobType.EVALUATION_ONLY
    ):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            " and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        # NOTE(review): the embedding service is given the *master*
        # resource request/limit but the *worker* pod priority — confirm
        # this mix is intentional.
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info(
            "Embedding service start succeeded. The endpoint is %s."
            % str(embedding_service_endpoint)
        )
        embedding_dims = dict(
            [(layer.name, layer.output_dim) for layer in layers]
        )

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        # A subclassed model may not be built yet, in which case it has
        # no trainable variables to hand over at this point.
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--master_addr",
            master_addr,
            "--job_type",
            job_type,
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
        ]
        worker_args.extend(build_arguments_from_parsed_result(args))

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    # Poll for job completion; save the final checkpoint if requested.
    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running..."
        )
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning(
                    "Unable to keep TensorBoard running. "
                    "It has already terminated"
                )
                break
    logger.info("Master stopped")
def _train_edl_embedding_with_optimizer_wrapper(
    model, opt_keras, X, Y, loss_fn, params, random_seed
):
    """Train model with optimizer wrapper.

    Runs a manual training loop over (X, Y) batches for a model that
    contains ElasticDL embedding layers: embedding vectors are looked up
    from and written back to `params` via the OptimizerWrapper, while
    non-embedding variables are updated in place.

    Args:
        model: model containing ElasticDL Embedding layers.
        opt_keras: Keras optimizer instance to wrap.
        X: training features, consumed batch by batch.
        Y: training labels matching X.
        loss_fn: callable computing loss from (outputs, labels).
        params: Parameters object holding embedding and slot tables.
        random_seed: seed passed to tf.random.set_seed for determinism.
    """
    tf.random.set_seed(random_seed)
    opt_wrapper = OptimizerWrapper(
        opt_keras,
        lookup_embedding_func=params.get_embedding_param,
        update_embedding_func=params.set_embedding_param,
    )

    embed_layers = find_layer(model, Embedding)

    # initialize slot params
    params.create_slot_params(
        opt_wrapper.allowed_slot_names, opt_wrapper.slot_initial_value
    )

    # initialize ElasticDL embedding layer
    for layer in embed_layers:
        layer.set_lookup_embedding_func(params.get_embedding_param)

    # training process
    for train_iter, (features, labels) in enumerate(zip(X, Y)):
        with tf.GradientTape() as tape:
            # Give each embedding layer the active tape so that its
            # lookups are recorded for gradient computation.
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # Need to get non-embedding variables inside for loop because model
        # creates variables after the first time `model.call` is called
        if not train_iter:
            non_embed_vars = get_non_embedding_trainable_vars(
                model, embed_layers
            )
        # Collect (batch_embedding_tensor, layer_name, ids) triples for
        # every lookup each embedding layer performed this step.
        embed_items = []
        for layer in embed_layers:
            embed_items.extend(
                [
                    (bet, layer.name, ids)
                    for bet, ids in layer.embedding_and_ids
                ]
            )

        grads = tape.gradient(
            loss, non_embed_vars + [var for var, _, _ in embed_items]
        )

        # TODO: do not need to merge gradient from the same embedding layer
        # after `optimizer_wrapper` support grads_and_vars with duplicated
        # layer name
        non_embed_vars_n = len(non_embed_vars)
        non_embed_grads = grads[:non_embed_vars_n]
        embed_grads_dict = {}
        for (_, layer_name, ids), grad in zip(
            embed_items, grads[non_embed_vars_n:]
        ):
            if layer_name in embed_grads_dict:
                # Same layer looked up more than once: concatenate the
                # sparse gradients (values and indices) into one slice.
                merged_grads = embed_grads_dict[layer_name]
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([merged_grads.values, grad.values], axis=0),
                    tf.concat([merged_grads.indices, ids], axis=0),
                )
            else:
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids
                )

        # Embedding grads are paired with the layer *name* (not a
        # variable); OptimizerWrapper resolves names to parameters.
        opt_wrapper.apply_gradients(
            list(zip(non_embed_grads, non_embed_vars))
            + [
                (grad, layer_name)
                for layer_name, grad in embed_grads_dict.items()
            ]
        )

        # Clear per-step lookup records so the next iteration starts fresh.
        for layer in embed_layers:
            layer.reset()