def _create_master_and_worker(self, service_endpoint=None, embedding_dims={}):
    model_inst = custom_model()
    master = MasterServicer(
        2,
        2,
        tf.optimizers.SGD(0.1),
        None,
        init_var=model_inst.trainable_variables,
        embedding_service_endpoint=service_endpoint,
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=None,
        checkpoint_service=None,
        evaluation_service=None,
    )
    arguments = [
        "--worker_id",
        1,
        "--job_type",
        JobType.TRAINING_ONLY,
        "--minibatch_size",
        2,
        "--model_zoo",
        _model_zoo_path,
        "--model_def",
        "test_module.custom_model",
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)
    worker.set_model(model_inst)
    worker._stub = InProcessMaster(master)
    return master, worker

def main():
    args = parse_worker_args()
    channel = grpc.insecure_channel(
        args.master_addr,
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    logger = log_util.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)
    worker = Worker(
        args.worker_id,
        args.job_type,
        args.minibatch_size,
        args.model_zoo,
        channel=channel,
        embedding_service_endpoint=eval(args.embedding_service_endpoint),
        dataset_fn=args.dataset_fn,
        loss=args.loss,
        optimizer=args.optimizer,
        eval_metrics_fn=args.eval_metrics_fn,
        model_def=args.model_def,
        model_params=args.model_params,
        get_model_steps=args.get_model_steps,
    )
    worker.run()

def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)

    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")
    master_channel = build_channel(args.master_addr)

    ps_channels = []
    if args.ps_addrs:
        ps_addrs = args.ps_addrs.split(",")
        for addr in ps_addrs:
            # addr is in the form of "ps-pod-name.namespace.svc:port"
            channel = build_channel(addr)
            # Wait until the channel is ready using a Future object.
            grpc.channel_ready_future(channel).result()
            logger.info(
                "grpc channel %s to connect pod %s is ready"
                % (addr, addr.split(".")[0])
            )
            ps_channels.append(channel)

    worker = Worker(args, channel=master_channel, ps_channels=ps_channels)
    worker.run()

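# Note: `build_channel` is referenced but not defined in these snippets. A
# minimal sketch of a comparable helper, assuming it simply wraps
# `grpc.insecure_channel` with the same message-size options used in the
# other variants here (`grpc` and the `GRPC` constants are assumed to be
# imported as in the surrounding code); the name `_build_channel_sketch` is
# hypothetical and only for illustration.
def _build_channel_sketch(addr):
    # Create an insecure gRPC channel with enlarged send/receive limits.
    return grpc.insecure_channel(
        addr,
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
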
def main():
    args = parse_worker_args()
    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")

    channel = grpc.insecure_channel(
        args.master_addr,
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )

    # TODO: create PS channels here
    ps_addrs = args.ps_addrs.split(",")
    # Just print ps_addrs out to avoid a flake8 failure.
    # This print can be removed once we initialize ps_channels
    # by using ps_addrs.
    print("Parameter server addresses are %s" % ps_addrs)
    ps_channels = None

    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)
    worker = Worker(args, channel=channel, ps_channels=ps_channels)
    worker.run()

def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)

    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")
    master_channel = build_channel(args.master_addr)

    ps_channels = []
    if args.ps_addrs:
        # TODO: use ps_addrs from master directly after ps service is working.
        # Get the ps pod ip for the ps grpc connection for now.
        ps_addrs = args.ps_addrs.split(",")

        config.load_incluster_config()
        api = client.CoreV1Api()

        for addr in ps_addrs:
            # addr is in the form of "ps-pod-name.namespace.svc:port"
            addr_splitted = addr.split(".")
            while True:
                pod = api.read_namespaced_pod(
                    namespace=addr_splitted[1], name=addr_splitted[0]
                )
                if pod.status.pod_ip:
                    break
                # If the ps pod is not ready yet, sleep 2 seconds and retry.
                time.sleep(2)
            addr = pod.status.pod_ip + ":" + addr.split(":")[-1]
            channel = grpc.insecure_channel(
                addr,
                options=[
                    (
                        "grpc.max_send_message_length",
                        GRPC.MAX_SEND_MESSAGE_LENGTH,
                    ),
                    (
                        "grpc.max_receive_message_length",
                        GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
                    ),
                ],
            )
            # Wait until the channel is ready using a Future object.
            grpc.channel_ready_future(channel).result()
            logger.info(
                "grpc channel %s to connect pod %s is ready"
                % (addr, pod.metadata.name)
            )
            ps_channels.append(channel)

    worker = Worker(args, channel=master_channel, ps_channels=ps_channels)
    worker.run()

def test_restart_ps(self):
    model_def = "mnist.mnist_functional_api.custom_model"
    num_data = 8
    training_data = [
        get_random_batch(self._batch_size) for _ in range(num_data)
    ]
    workers = []
    self._create_pserver(model_def, 2)
    for w in range(2):
        self._reset_pserver()
        arguments = [
            "--worker_id",
            0,
            "--job_type",
            elasticdl_pb2.TRAINING,
            "--minibatch_size",
            self._batch_size,
            "--model_zoo",
            self._model_zoo_path,
            "--model_def",
            model_def,
            "--distribution_strategy",
            DistributionStrategy.PARAMETER_SERVER,
        ]
        args = parse_worker_args(arguments)
        tf.keras.backend.clear_session()
        tf.random.set_seed(22)
        worker = Worker(args, ps_client=PSClient(self._channels))
        workers.append(worker)
        worker._trainer._run_model_call_before_training(training_data[0][0])
        for i in range(num_data):
            worker._trainer._get_model()
            w_loss, w_grads = worker._trainer._training_process_eagerly(
                training_data[i][0], training_data[i][1]
            )
            worker._trainer._report_gradient(w_grads)
            if w == 1 and i == 3:
                # Restart ps for the 2nd worker at i==3
                # self._restart_pserver(model_def)
                self._reset_pserver()
                # `push_dense_parameters` will be called in `get_model` to
                # initialize variables on ps with worker variables
                worker._trainer._get_model()
                # send the grads again as these grads are not applied
                # on worker variables
                worker._trainer._report_gradient(w_grads)

    for var_name in workers[0]._trainer._non_embed_vars:
        np.testing.assert_array_equal(
            workers[0]._trainer._non_embed_vars[var_name].numpy(),
            workers[1]._trainer._non_embed_vars[var_name].numpy(),
        )
    self._close_channels()

def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)

    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")
    master_channel = grpc.insecure_channel(
        args.master_addr,
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )

    ps_channels = []
    if args.ps_addrs:
        # TODO: use ps_addrs from master directly after ps service is working.
        # Get the ps pod ip for the ps grpc connection for now.
        ps_addrs = args.ps_addrs.split(",")

        from kubernetes import client, config

        config.load_incluster_config()
        api = client.CoreV1Api()

        for addr in ps_addrs:
            # addr is in the form of "ps-pod-name.namespace.svc:port"
            addr_splitted = addr.split(".")
            pod = api.read_namespaced_pod(
                namespace=addr_splitted[1], name=addr_splitted[0]
            )
            addr = pod.status.pod_ip + ":" + addr.split(":")[-1]
            channel = grpc.insecure_channel(
                addr,
                options=[
                    (
                        "grpc.max_send_message_length",
                        GRPC.MAX_SEND_MESSAGE_LENGTH,
                    ),
                    (
                        "grpc.max_receive_message_length",
                        GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
                    ),
                ],
            )
            ps_channels.append(channel)

    worker = Worker(args, channel=master_channel, ps_channels=ps_channels)
    worker.run()

def test_worker_pull_embedding(self):
    model_def = "mnist_functional_api.mnist_functional_api.custom_model"
    self._create_pserver(model_def, 2)
    arguments = [
        "--worker_id",
        0,
        "--job_type",
        elasticdl_pb2.TRAINING,
        "--minibatch_size",
        self._batch_size,
        "--model_zoo",
        self._model_zoo_path,
        "--model_def",
        model_def,
        "--distribution_strategy",
        DistributionStrategy.PARAMETER_SERVER,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args, ps_channels=self._channels)

    # Test lookup embedding vectors that do not exist
    layers = ["test-2", "test-2-slot"]
    ids = [3, 5, 1, 6, 10, 2, 1, 2, 4, 7, 9]
    embedding_table_args = [
        (layers[0], 8, "uniform", False),
        (layers[1], 8, 3.3, True),
    ]

    # initialize embedding table object
    for pserver in self._pservers:
        for layer, table_args in zip(layers, embedding_table_args):
            pserver.parameters.embedding_params[layer] = EmbeddingTable(
                *table_args
            )

    result_dict = {}
    for layer in layers:
        embedding = worker.pull_embedding_vectors(layer, ids)
        result_dict[layer] = embedding

    for layer in layers:
        expected_result = []
        for embedding_id in ids:
            ps_id = int_to_id(embedding_id, len(self._pservers))
            table = self._pservers[ps_id].parameters.embedding_params[layer]
            expected_result.append(table.get([embedding_id]))
        expected_result = np.concatenate(expected_result)
        self.assertTrue(np.allclose(expected_result, result_dict[layer]))

def test_restart_ps(self):
    num_data = 8
    training_data = [
        random_batch(self._batch_size) for _ in range(num_data)
    ]
    workers = []
    for w in range(2):
        self._restart_pserver()
        tf.keras.backend.clear_session()
        tf.random.set_seed(22)
        arguments = [
            "--worker_id",
            0,
            "--job_type",
            elasticdl_pb2.TRAINING,
            "--minibatch_size",
            self._batch_size,
            "--model_zoo",
            self._model_zoo_path,
            "--model_def",
            self._model_def,
            "--distribution_strategy",
            "ParameterServerStrategy",
        ]
        args = parse_worker_args(arguments)
        worker = Worker(args, ps_channels=self._channel)
        workers.append(worker)
        worker._run_model_call_before_training(training_data[0][0])
        for i in range(num_data):
            worker.get_model(0, elasticdl_pb2.MINIMUM)
            w_loss, w_grads = worker.training_process_eagerly(
                training_data[i][0], training_data[i][1]
            )
            worker.report_gradient(w_grads)
            if w == 1 and i == 3:
                # Restart ps for the 2nd worker at i==3
                self._restart_pserver()
                # `report_variable` will be called in `get_model` to
                # initialize variables on ps with worker variables
                worker.get_model(0, elasticdl_pb2.MINIMUM)
                # send the grads again as these grads are not applied
                # on worker variables
                worker.report_gradient(w_grads)

    for var_name in workers[0]._non_embed_vars:
        np.testing.assert_array_equal(
            workers[0]._non_embed_vars[var_name].numpy(),
            workers[1]._non_embed_vars[var_name].numpy(),
        )

def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)

    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")
    master_channel = build_channel(args.master_addr)

    ps_channels = []
    if args.ps_addrs:
        ps_addrs = args.ps_addrs.split(",")
        for addr in ps_addrs:
            # addr is in the form of "ps-pod-name.namespace.svc:port"
            channel = build_channel(addr)

            succeeded = False
            for i in range(CONNECT_PS_MAX_RETRIES):
                try:
                    grpc.channel_ready_future(channel).result(
                        timeout=CONNECT_PS_TIMEOUT
                    )
                    logger.info(
                        "grpc channel %s to connect pod %s is ready"
                        % (addr, addr.split(".")[0])
                    )
                    ps_channels.append(channel)
                    succeeded = True
                    break
                except grpc.FutureTimeoutError:
                    logger.warning(
                        "Failed to connect to pod %s, retry %d"
                        % (addr.split(".")[0], i)
                    )
            if not succeeded:
                raise TimeoutError(
                    "Timed out connecting to pod %s after %d retries"
                    % (addr.split(".")[0], CONNECT_PS_MAX_RETRIES)
                )

    if args.distribution_strategy == DistributionStrategy.ALLREDUCE:
        logger.info(
            "Wait for %s seconds for FTLib consensus service to "
            "detect the worker pod" % str(_ALLREDUCE_STRATEGY_WARM_UP_SECS)
        )
        time.sleep(_ALLREDUCE_STRATEGY_WARM_UP_SECS)

    worker = Worker(
        args,
        channel=master_channel,
        ps_channels=ps_channels,
        set_parallelism=True,
    )
    worker.run()

def test_embedding_layer(self):
    arguments = [
        "--worker_id",
        1,
        "--job_type",
        JobType.TRAINING_ONLY,
        "--minibatch_size",
        32,
        "--model_zoo",
        _model_zoo_path,
        "--model_def",
        "embedding_test_module.EdlEmbeddingModel",
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)
    self.assertTrue(len(worker._embedding_layers) == 2)

def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)

    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")
    master_client = MasterClient(
        build_channel(args.master_addr), args.worker_id
    )

    ps_client = None
    if (
        args.distribution_strategy == DistributionStrategy.PARAMETER_SERVER
        and args.ps_addrs
    ):
        ps_channels = []
        ps_addrs = args.ps_addrs.split(",")
        for addr in ps_addrs:
            # addr is in the form of "ps-pod-name.namespace.svc:port"
            channel = build_channel(addr)

            succeeded = False
            for i in range(CONNECT_PS_MAX_RETRIES):
                try:
                    grpc.channel_ready_future(channel).result(
                        timeout=CONNECT_PS_TIMEOUT
                    )
                    logger.info(
                        "grpc channel %s to connect pod %s is ready"
                        % (addr, addr.split(".")[0])
                    )
                    ps_channels.append(channel)
                    succeeded = True
                    break
                except grpc.FutureTimeoutError:
                    logger.warning(
                        "Failed to connect to pod %s, retry %d"
                        % (addr.split(".")[0], i)
                    )
            if not succeeded:
                raise TimeoutError(
                    "Timed out connecting to pod %s after %d retries"
                    % (addr.split(".")[0], CONNECT_PS_MAX_RETRIES)
                )
        ps_client = PSClient(ps_channels)

    worker = Worker(
        args,
        master_client=master_client,
        ps_client=ps_client,
        set_parallelism=True,
    )
    worker.run()

def _create_worker(self, worker_num):
    for i in range(worker_num):
        tf.keras.backend.clear_session()
        tf.random.set_seed(22)
        arguments = [
            "--job_type",
            elasticai_api_pb2.TRAINING,
            "--minibatch_size",
            self._batch_size,
            "--model_zoo",
            self._model_zoo_path,
            "--model_def",
            self._model_def,
            "--distribution_strategy",
            DistributionStrategy.PARAMETER_SERVER,
        ]
        args = parse_worker_args(arguments)
        worker = Worker(args, ps_client=PSClient(self._channels))
        self._workers.append(worker)

def _create_worker(self, worker_num, max_allreduce_retry_num=0):
    for i in range(worker_num):
        arguments = [
            "--worker_id",
            i,
            "--job_type",
            elasticdl_pb2.TRAINING,
            "--minibatch_size",
            self._batch_size,
            "--model_zoo",
            self._model_zoo_path,
            "--model_def",
            self._model_def,
            "--distribution_strategy",
            DistributionStrategy.ALLREDUCE,
        ]
        args = parse_worker_args(arguments)
        worker = Worker(
            args, max_allreduce_retry_num=max_allreduce_retry_num
        )
        self._workers.append(worker)

def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    master_addr = args.master_addr
    worker_id = int(args.worker_id)
    logger.info("Starting worker %d", worker_id)

    master_client = MasterClient(build_channel(master_addr), worker_id)

    logger.info("Building PS connection...")
    ps_client = (
        build_ps_client(args.ps_addrs, logger)
        if args.distribution_strategy == DistributionStrategy.PARAMETER_SERVER
        else None
    )
    logger.info("PS connection is built.")

    worker = Worker(
        args,
        master_client=master_client,
        ps_client=ps_client,
        set_parallelism=True,
    )
    worker.run()

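# Note: `build_ps_client` is not defined in these snippets. A minimal sketch,
# assuming it builds one gRPC channel per comma-separated address with the
# same retry loop shown in the earlier variant (using `build_channel`,
# `CONNECT_PS_MAX_RETRIES`, `CONNECT_PS_TIMEOUT`, and `PSClient` from the
# surrounding code); the name `_build_ps_client_sketch` is hypothetical and
# only for illustration.
def _build_ps_client_sketch(ps_addrs, logger):
    if not ps_addrs:
        return None
    ps_channels = []
    for addr in ps_addrs.split(","):
        # addr is in the form of "ps-pod-name.namespace.svc:port"
        channel = build_channel(addr)
        for i in range(CONNECT_PS_MAX_RETRIES):
            try:
                # Block until the channel is ready or the timeout expires.
                grpc.channel_ready_future(channel).result(
                    timeout=CONNECT_PS_TIMEOUT
                )
                ps_channels.append(channel)
                break
            except grpc.FutureTimeoutError:
                logger.warning(
                    "Failed to connect to pod %s, retry %d"
                    % (addr.split(".")[0], i)
                )
        else:
            # No break occurred, so every retry timed out.
            raise TimeoutError(
                "Timed out connecting to pod %s" % addr.split(".")[0]
            )
    return PSClient(ps_channels)
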
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callbacks that will be called at given
            stages of the training procedure.
        use_async: A python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version

def _create_worker(self, arguments):
    tf.keras.backend.clear_session()
    tf.random.set_seed(22)
    args = parse_worker_args(arguments)
    return Worker(args)

def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.AllreduceStrategy.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    evaluation_steps = (
        1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__

    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )

    def master_creator():
        return MasterServicer(
            batch_size,
            evaluation_service=evaluation_service,
            master=master,
        )

    svc, port = _server(master_creator)
    mc = MasterClient(build_channel("localhost:%d" % port), 1)
    worker = Worker(args, master_client=mc, ps_client=PSClient(ps_channels))

    for pservicer in pservers:
        # FIXME(yancey1989): decouple pserver and master client
        pservicer._master_stub = mc

    worker.run()

    task = mc.get_task()
    # Stop the master servicer.
    svc.stop(0)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return task.model_version

def testMaxCheckpointVersions(self):
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testMaxCheckpointVersions")
        os.makedirs(chkp_dir)
        # Save checkpoints every 2 steps, and keep 5 checkpoints at most
        checkpointer = CheckpointService(chkp_dir, 2, 5, False)
        self.assertTrue(checkpointer.is_enabled())

        batch_size = 2
        # Launch the training
        arguments = [
            "--worker_id",
            1,
            "--job_type",
            JobType.TRAINING_ONLY,
            "--minibatch_size",
            batch_size,
            "--model_zoo",
            _model_zoo_path,
            "--model_def",
            "test_module.custom_model",
        ]
        args = parse_worker_args(arguments)
        worker = Worker(args)

        filename = create_recordio_file(128, DatasetName.TEST_MODULE, 1)
        task_d = _TaskDispatcher(
            {filename: (0, 128)}, {}, {}, records_per_task=64, num_epochs=1
        )
        master = MasterServicer(
            2,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=worker._model.trainable_variables,
            checkpoint_filename_for_init="",
            checkpoint_service=checkpointer,
            evaluation_service=None,
        )

        worker._stub = InProcessMaster(master)
        worker.run()

        # We should have 5 checkpoints when the training finishes
        checkpoint_files = sorted(os.listdir(checkpointer._directory))
        self.assertEqual(
            checkpoint_files,
            [
                "model_v24.chkpt",
                "model_v26.chkpt",
                "model_v28.chkpt",
                "model_v30.chkpt",
                "model_v32.chkpt",
            ],
        )
        # Latest version should be 32
        self.assertEqual(32, checkpointer.get_latest_checkpoint_version())
        # Check all checkpoints
        for version in [24, 26, 28, 30, 32]:
            model = checkpointer.get_checkpoint_model(version)
            self.assertEqual(version, model.version)
        # Checkpoint not found
        self.assertRaisesRegex(
            RuntimeError,
            "Failed to read model checkpoint from file",
            checkpointer.get_checkpoint_model,
            100,
        )

def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callbacks that will be called at given
            stages of the training procedure.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.AllreduceStrategy.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    evaluation_steps = (
        1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__

    for channel in ps_channels:
        grpc.channel_ready_future(channel).result()
    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)
    worker = Worker(args, ps_channels=ps_channels)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = MasterServicer(
        batch_size,
        task_d,
        evaluation_service=evaluation_service,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    in_process_master = InProcessMaster(master, callbacks)
    worker._stub = in_process_master
    for pservicer in pservers:
        pservicer._master_stub = in_process_master

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.get_task(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version

def _worker_train(self, train_db, test_db, dataset, stop_step):
    if dataset == "mnist":
        model_def = (
            "mnist_functional_api.mnist_functional_api.custom_model"
        )
    elif dataset == "frappe":
        model_def = (
            "deepfm_functional_api.deepfm_functional_api.custom_model"
        )
    else:
        raise ValueError("dataset %s is not supported" % dataset)
    arguments = [
        "--worker_id",
        0,
        "--job_type",
        elasticdl_pb2.TRAINING,
        "--minibatch_size",
        self._batch_size,
        "--model_zoo",
        self._model_zoo_path,
        "--model_def",
        model_def,
        "--distribution_strategy",
        "ParameterServerStrategy",
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args, ps_channels=self._channel)
    acc_meter = tf.keras.metrics.Accuracy()
    worker_results = []
    for step, (x, y) in enumerate(train_db):
        if step == 0:
            worker._run_model_call_before_training(x)
        worker.get_model(step, elasticdl_pb2.MINIMUM)
        w_loss, w_grads = worker.training_process_eagerly(x, y)
        worker.report_gradient(w_grads)

        if step % 20 == 0:
            worker.get_model(step, elasticdl_pb2.MINIMUM)
            for (x, y) in test_db:
                out = worker.forward_process(x)
                if dataset == "mnist":
                    acc_meter.update_state(tf.argmax(out, axis=1), y)
                else:
                    out["probs"] = tf.reshape(out["probs"], [-1])
                    acc_meter.update_state(
                        tf.where(
                            out["probs"] < 0.5,
                            x=tf.zeros_like(y),
                            y=tf.ones_like(y),
                        ),
                        y,
                    )
            worker_results.append(
                (float(w_loss.numpy()), float(acc_meter.result().numpy()))
            )
            acc_meter.reset_states()

        if step > stop_step:
            break
    return worker_results

def test_compare_onebatch_train(self):
    model_def = "mnist_functional_api.mnist_functional_api.custom_model"
    self._create_pserver(model_def, 2)
    images, labels = get_random_batch(self._batch_size)
    # TODO(yunjian.lmh): test optimizer wrapper
    arguments = [
        "--worker_id",
        0,
        "--job_type",
        elasticdl_pb2.TRAINING,
        "--minibatch_size",
        self._batch_size,
        "--model_zoo",
        self._model_zoo_path,
        "--model_def",
        model_def,
        "--distribution_strategy",
        DistributionStrategy.PARAMETER_SERVER,
    ]
    args = parse_worker_args(arguments)

    tf.keras.backend.clear_session()
    tf.random.set_seed(22)
    worker = Worker(args, ps_channels=self._channels)
    worker._run_model_call_before_training(images)
    worker.get_model()
    w_loss, w_grads = worker.training_process_eagerly(images, labels)
    worker.report_gradient(w_grads)

    tf.keras.backend.clear_session()
    tf.random.set_seed(22)
    (
        model,
        dataset_fn,
        loss_fn,
        opt_fn,
        eval_metrics_fn,
        prediction_outputs_processor,
        create_data_reader_fn,
        callback_list,
    ) = get_model_spec(
        model_zoo=self._model_zoo_path,
        model_def=model_def,
        dataset_fn="dataset_fn",
        model_params=None,
        loss="loss",
        optimizer="optimizer",
        eval_metrics_fn="eval_metrics_fn",
        prediction_outputs_processor="PredictionOutputsProcessor",
        custom_data_reader="custom_data_reader",
        callbacks="callbacks",
    )

    with tf.GradientTape() as tape:
        output = model.call(images, training=True)
        labels = tf.reshape(labels, [-1])
        loss = loss_fn(labels, output)
    grads = tape.gradient(loss, model.trainable_variables)
    opt_fn().apply_gradients(zip(grads, model.trainable_variables))

    for v in model.trainable_variables:
        ps_id = string_to_id(v.name, len(self._channels))
        ps_v = self._pservers[ps_id].parameters.get_non_embedding_param(
            v.name
        )
        np.testing.assert_array_equal(ps_v.numpy(), v.numpy())

def test_train_acceleration_with_embedding(self):
    kv_store = MockKvStore()
    model_inst = CustomModel()
    master = MasterServicer(
        2,
        2,
        tf.optimizers.SGD(0.1),
        None,
        init_var=model_inst.trainable_variables,
        checkpoint_filename_for_init=None,
        checkpoint_service=None,
        evaluation_service=None,
    )
    arguments = [
        "--worker_id",
        1,
        "--job_type",
        JobType.TRAINING_ONLY,
        "--minibatch_size",
        32,
        "--model_zoo",
        _model_zoo_path,
        "--model_def",
        "embedding_test_module.EdlEmbeddingModel",
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)
    worker._stub = InProcessMaster(master)

    inputs_list = [
        {
            "f1": tf.constant([[0], [1], [2]], tf.int64),
            "f2": tf.constant([[2], [1], [0]], tf.int64),
        },
        {
            "f1": tf.constant([[3], [4], [3]], tf.int64),
            "f2": tf.constant([[2], [1], [0]], tf.int64),
        },
    ]
    labels_list = [[0, 1, 0], [1, 1, 0]]
    input_dim = 5
    embedding_dim = 16
    worker.set_model(model_inst)

    # initialize kv store
    for layer in model_inst.layers:
        if isinstance(layer, Embedding):
            name = layer.name
            keys = [Embedding.get_key([name, i]) for i in range(input_dim)]
            values = [
                np.random.rand(embedding_dim).astype(np.float32)
                for i in range(input_dim)
            ]
            kv_store.update(keys, values)

    with mock.patch.object(
        EmbeddingService, "lookup_embedding", kv_store.lookup
    ), mock.patch.object(
        EmbeddingService, "update_embedding", kv_store.update
    ):
        worker._init_embedding_layer()
        worker._run_model_call_before_training(inputs_list[0])

        # run training process without tf.function
        correct_grads = []
        correct_ids_list = []
        for features, labels in zip(inputs_list, labels_list):
            loss, grads = worker.training_process_eagerly(features, labels)
            correct_grads.append(grads)
            ids = {}
            for layer in worker._embedding_layers:
                ids[layer.name] = layer.embedding_and_ids[0].batch_ids
            correct_ids_list.append(ids)
            worker._reset_embedding()

        # run training process with tf.function
        test_grads = []
        test_ids_list = []
        for features, labels in zip(inputs_list, labels_list):
            self.assertFalse(worker._train_eagerly)
            loss, grads = worker.training_process(features, labels)
            test_grads.append(grads)
            ids = {}
            for layer in worker._embedding_layers:
                ids[layer.name] = copy.deepcopy(
                    layer.embedding_and_ids[0].batch_ids
                )
            test_ids_list.append(ids)
            worker._reset_embedding()

    # compare the gradients
    for test_g, correct_g in zip(test_grads, correct_grads):
        for g1, g2 in zip(test_g, correct_g):
            if isinstance(g1, tf.IndexedSlices):
                self.assertTrue(np.isclose(g1.values, g2.values).all())
                self.assertTrue(np.isclose(g1.indices, g2.indices).all())
            else:
                self.assertTrue(np.isclose(g1, g2).all())

    for test_ids, correct_ids in zip(correct_ids_list, test_ids_list):
        for layer_name in correct_ids.keys():
            self.assertTrue(
                tf.equal(test_ids[layer_name], correct_ids[layer_name])
                .numpy()
                .all()
            )