def test_get_model_spec(self):
    """Every component named in the model zoo module should resolve to a non-None object."""
    (
        model,
        dataset_fn,
        loss,
        optimizer,
        eval_metrics_fn,
        prediction_outputs_processor,
    ) = get_model_spec(
        model_zoo=_model_zoo_path,
        model_def="test_module.custom_model",
        dataset_fn="dataset_fn",
        loss="loss",
        optimizer="optimizer",
        eval_metrics_fn="eval_metrics_fn",
        model_params="",
        prediction_outputs_processor="PredictionOutputsProcessor",
    )
    # assertIsNotNone names the offending value on failure, unlike
    # assertTrue(x is not None) which only reports "False is not true".
    self.assertIsNotNone(model)
    self.assertIsNotNone(dataset_fn)
    self.assertIsNotNone(loss)
    self.assertIsNotNone(optimizer)
    self.assertIsNotNone(eval_metrics_fn)
    self.assertIsNotNone(prediction_outputs_processor)
def test_compare_mnist_train(self):
    """Train MNIST through the distributed worker and locally with the
    same seed; per-evaluation (loss, accuracy) tuples must match exactly.
    """
    model_def = "mnist_functional_api.mnist_functional_api.custom_model"
    self._create_pserver(model_def, 2)
    train_db, eval_db = get_mnist_dataset(self._batch_size)
    stop_step = 20
    self._create_worker(1)
    worker_results = self._worker_train(
        0, train_db=train_db, test_db=eval_db, stop_step=stop_step)
    # Reset TF state so the local run replays the identical RNG stream.
    tf.keras.backend.clear_session()
    tf.random.set_seed(22)
    accuracy = tf.keras.metrics.Accuracy()
    (
        model,
        dataset_fn,
        loss_fn,
        opt_fn,
        eval_metrics_fn,
        prediction_outputs_processor,
        create_data_reader_fn,
        callbacks_list,
    ) = get_model_spec(
        model_zoo=self._model_zoo_path,
        model_def=model_def,
        dataset_fn="dataset_fn",
        model_params=None,
        loss="loss",
        optimizer="optimizer",
        eval_metrics_fn="eval_metrics_fn",
        prediction_outputs_processor="PredictionOutputsProcessor",
        custom_data_reader="custom_data_reader",
        callbacks="callbacks",
    )
    local_results = []
    for batch_idx, (features, targets) in enumerate(train_db):
        with tf.GradientTape() as tape:
            predictions = model.call(features, training=True)
            batch_loss = loss_fn(targets, predictions)
        gradients = tape.gradient(batch_loss, model.trainable_variables)
        opt_fn().apply_gradients(zip(gradients, model.trainable_variables))
        if batch_idx % 20 == 0:
            # Score the whole test set every 20 batches, mirroring the worker.
            for (eval_x, eval_y) in eval_db:
                eval_out = model.call(eval_x, training=False)
                accuracy.update_state(tf.argmax(eval_out, axis=1), eval_y)
            local_results.append(
                (float(batch_loss.numpy()),
                 float(accuracy.result().numpy())))
            accuracy.reset_states()
        if batch_idx > stop_step:
            break
    for distributed, local in zip(worker_results, local_results):
        self.assertTupleEqual(distributed, local)
def _init_model_from_args(self, args):
    """Resolve the model spec from ``args`` and attach optimizer and loss.

    Please refer to elastic/python/common/args.py for more details about
    arguments of a worker.
    """
    self._log_loss_steps = args.log_loss_steps
    (
        raw_model,
        self._feed,
        loss_fn,
        optimizer_fn,
        self._eval_metrics_fn,
        self._prediction_outputs_processor,
        self._custom_data_reader,
        self._callbacks_list,
    ) = get_model_spec(
        model_zoo=args.model_zoo,
        model_def=args.model_def,
        feed=args.feed,
        loss=args.loss,
        optimizer=args.optimizer,
        eval_metrics_fn=args.eval_metrics_fn,
        prediction_outputs_processor=args.prediction_outputs_processor,
        custom_data_reader=args.custom_data_reader,
        callbacks=args.callbacks,
    )
    # Wrap the raw model for the active distribution strategy before use.
    self._model_handler = ModelHandler.get_model_handler(
        self._distribution_strategy, checkpoint_dir=args.checkpoint_dir)
    self._model_inst = self._model_handler.get_model_to_train(raw_model)
    self._model_inst.optimizer = optimizer_fn()
    self._model_inst.loss = loss_fn
    # -1 marks "no model version pulled yet".
    self._model_version = -1
    self._get_model_steps = args.get_model_steps
def _init_from_args(self, args):
    """
    Initialize worker state from parsed CLI arguments: resolve the model
    spec, wrap the model for the distribution strategy, and set up the
    task data service.

    Please refer to elastic/python/common/args.py for more details about
    arguments of a worker.
    """
    self._worker_id = args.worker_id
    self._job_type = args.job_type
    self._minibatch_size = args.minibatch_size
    # Resolve user-defined components from the model zoo by name.
    (
        model_inst,
        self._dataset_fn,
        self._loss,
        self._opt_fn,
        self._eval_metrics_fn,
        self._prediction_outputs_processor,
        self._custom_data_reader,
    ) = get_model_spec(
        model_zoo=args.model_zoo,
        model_def=args.model_def,
        dataset_fn=args.dataset_fn,
        loss=args.loss,
        optimizer=args.optimizer,
        eval_metrics_fn=args.eval_metrics_fn,
        model_params=args.model_params,
        prediction_outputs_processor=args.prediction_outputs_processor,
        custom_data_reader=args.custom_data_reader,
    )
    # A collective communicator is only needed for AllReduce training.
    self._collective_communicator = (
        CollectiveCommunicator()
        if self._distribution_strategy == DistributionStrategy.ALLREDUCE
        else None)
    self._model_handler = ModelHandler.get_model_handler(
        self._distribution_strategy, checkpoint_dir=args.checkpoint_dir)
    model_inst = self._model_handler.get_model_to_train(model_inst)
    self.set_model(model_inst)
    # -1 marks "no model version pulled yet", one entry per PS instance.
    self._model_version = -1
    self._model_versions_from_ps = [-1 for _ in range(self._ps_num)]
    self._task_data_service = TaskDataService(
        self,
        self._job_type == JobType.TRAINING_WITH_EVALUATION,
        data_reader_params=get_dict_from_params_str(
            args.data_reader_params),
    )
    # When the model zoo defines no dataset_fn, fall back to the data
    # reader's default implementation (if it provides one).
    if self._dataset_fn is None:
        if hasattr(self._task_data_service.data_reader,
                   "default_dataset_fn"):
            self._dataset_fn = (
                self._task_data_service.data_reader.default_dataset_fn())
        else:
            raise ValueError(
                "dataset_fn is required if the data_reader used does "
                "not provide default implementation of dataset_fn")
    self._get_model_steps = args.get_model_steps
    # With get_model every N>1 steps, gradients are accumulated locally,
    # so a local optimizer instance and gradient store are required.
    if self._get_model_steps > 1:
        self._opt = self._opt_fn()
        self._non_embed_grads = {}
    self._evaluation_result = {}
def _init_from_args(self, args):
    """
    Initialize worker state from parsed CLI arguments: resolve the model
    spec, connect to the embedding service, and set up the task data
    service.

    Please refer to elastic/python/common/args.py for more details about
    arguments of a worker.
    """
    self._worker_id = args.worker_id
    self._job_type = args.job_type
    self._minibatch_size = args.minibatch_size
    # Resolve user-defined components from the model zoo by name.
    (
        model_inst,
        self._dataset_fn,
        self._loss,
        self._opt_fn,
        self._eval_metrics_fn,
        self._prediction_outputs_processor,
    ) = get_model_spec(
        model_zoo=args.model_zoo,
        model_def=args.model_def,
        dataset_fn=args.dataset_fn,
        loss=args.loss,
        optimizer=args.optimizer,
        eval_metrics_fn=args.eval_metrics_fn,
        model_params=args.model_params,
        prediction_outputs_processor=args.prediction_outputs_processor,
    )
    # SECURITY NOTE(review): eval() on a CLI-supplied string executes
    # arbitrary code. If the endpoint is a plain Python literal,
    # ast.literal_eval would be the safe replacement — confirm the
    # expected format before changing.
    self._embedding_service_endpoint = eval(
        args.embedding_service_endpoint)
    self._distribution_strategy = args.distribution_strategy
    # A collective communicator is only needed for AllReduce training.
    self._collective_communicator = (
        CollectiveCommunicator()
        if self._distribution_strategy == DistributionStrategy.ALLREDUCE
        else None)
    self._model_handler = ModelHandler.get_model_handler(
        self._distribution_strategy, checkpoint_dir=args.checkpoint_dir)
    model_inst = self._model_handler.get_model_to_train(model_inst)
    self.set_model(model_inst)
    # -1 marks "no model version pulled yet".
    self._model_version = -1
    self._task_data_service = TaskDataService(
        self,
        self._job_type == JobType.TRAINING_WITH_EVALUATION,
        data_reader_params=get_dict_from_params_str(
            args.data_reader_params),
    )
    self._get_model_steps = args.get_model_steps
    # With get_model every N>1 steps, gradients are accumulated locally,
    # so a local optimizer instance is required.
    if self._get_model_steps > 1:
        self._opt = self._opt_fn()
        self._non_embed_grads = None
    self._evaluation_result = {}
def __init__(self, args):
    """Build a local executor from parsed CLI ``args``: resolve the model
    spec, instantiate the optimizer, and construct the data reader.
    """
    env_vars = parse_envs(args.envs)
    self._init_environment(env_vars)
    (
        self.model_inst,
        self.dataset_fn,
        self.loss_fn,
        self.opt_fn,
        self.eval_metrics_fn,
        self.prediction_outputs_processor,
        self.custom_data_reader,
        self.callback_list,
    ) = get_model_spec(
        model_zoo=args.model_zoo,
        model_def=args.model_def,
        dataset_fn=args.dataset_fn,
        loss=args.loss,
        optimizer=args.optimizer,
        eval_metrics_fn=args.eval_metrics_fn,
        model_params=args.model_params,
        prediction_outputs_processor="",
        custom_data_reader=args.custom_data_reader,
        callbacks=args.callbacks,
    )
    self.opt = self.opt_fn()
    self.epoch = args.num_epochs
    self.evaluation_steps = args.evaluation_steps
    self.batch_size = args.minibatch_size
    self.data_reader_params = get_dict_from_params_str(
        args.data_reader_params)
    self.records_per_task = (
        args.minibatch_size * args.num_minibatches_per_task)
    # Prefer the model zoo's custom reader factory; otherwise fall back
    # to the framework default.
    if self.custom_data_reader is None:
        reader_factory = create_data_reader
    else:
        reader_factory = self.custom_data_reader
    self.data_reader = reader_factory(
        data_origin=args.training_data,
        records_per_task=self.records_per_task,
        **self.data_reader_params)
    self.training_data = args.training_data
    self.validation_data = args.validation_data
    self.save_model_dir = args.output
def test_get_model_spec(self):
    """All components resolve for a valid model_def; an undefined one raises."""
    (
        model,
        dataset_fn,
        loss,
        optimizer,
        eval_metrics_fn,
        prediction_outputs_processor,
        custom_data_reader,
        callback_list,
    ) = get_model_spec(
        model_zoo=_model_zoo_path,
        model_def="test_module.custom_model",
        dataset_fn="dataset_fn",
        loss="loss",
        optimizer="optimizer",
        eval_metrics_fn="eval_metrics_fn",
        model_params="",
        prediction_outputs_processor="PredictionOutputsProcessor",
        custom_data_reader="custom_data_reader",
        callbacks="callbacks",
    )
    # assertIsNotNone names the offending value on failure, unlike
    # assertTrue(x is not None) which only reports "False is not true".
    self.assertIsNotNone(model)
    self.assertIsNotNone(dataset_fn)
    self.assertIsNotNone(loss)
    self.assertIsNotNone(optimizer)
    self.assertIsNotNone(eval_metrics_fn)
    self.assertIsNotNone(prediction_outputs_processor)
    self.assertIsNotNone(custom_data_reader)
    self.assertIsNotNone(callback_list)
    self.assertEqual(len(callback_list.callbacks), 1)
    # A model_def pointing at a missing symbol must fail loudly.
    self.assertRaisesRegex(
        Exception,
        "Cannot find the custom model function/class "
        "in model definition files",
        get_model_spec,
        model_zoo=_model_zoo_path,
        model_def="test_module.undefined",
        dataset_fn="dataset_fn",
        loss="loss",
        optimizer="optimizer",
        eval_metrics_fn="eval_metrics_fn",
        model_params="",
        prediction_outputs_processor="PredictionOutputsProcessor",
        custom_data_reader="custom_data_reader",
        callbacks="callbacks",
    )
def test_compare_onebatch_train(self):
    """Train one batch through the worker/PS path and locally with the
    same seed; the parameters stored on the PS must equal the locally
    updated ones.
    """
    model_def = "mnist_functional_api.mnist_functional_api.custom_model"
    self._create_pserver(model_def, 2)
    images, labels = get_random_batch(self._batch_size)
    # TODO(yunjian.lmh): test optimizer wrapper
    arguments = [
        "--worker_id",
        0,
        "--job_type",
        elasticdl_pb2.TRAINING,
        "--minibatch_size",
        self._batch_size,
        "--model_zoo",
        self._model_zoo_path,
        "--model_def",
        model_def,
        "--distribution_strategy",
        DistributionStrategy.PARAMETER_SERVER,
    ]
    args = parse_worker_args(arguments)
    # Fix TF state so the worker run is reproducible.
    tf.keras.backend.clear_session()
    tf.random.set_seed(22)
    worker = Worker(args, ps_channels=self._channels)
    worker._run_model_call_before_training(images)
    worker.get_model()
    w_loss, w_grads = worker.training_process_eagerly(images, labels)
    worker.report_gradient(w_grads)
    # Reset and reseed so the local model starts from identical weights.
    tf.keras.backend.clear_session()
    tf.random.set_seed(22)
    (
        model,
        dataset_fn,
        loss_fn,
        opt_fn,
        eval_metrics_fn,
        prediction_outputs_processor,
        create_data_reader_fn,
        callback_list,
    ) = get_model_spec(
        model_zoo=self._model_zoo_path,
        model_def=model_def,
        dataset_fn="dataset_fn",
        model_params=None,
        loss="loss",
        optimizer="optimizer",
        eval_metrics_fn="eval_metrics_fn",
        prediction_outputs_processor="PredictionOutputsProcessor",
        custom_data_reader="custom_data_reader",
        callbacks="callbacks",
    )
    with tf.GradientTape() as tape:
        output = model.call(images, training=True)
        labels = tf.reshape(labels, [-1])
        loss = loss_fn(labels, output)
    grads = tape.gradient(loss, model.trainable_variables)
    opt_fn().apply_gradients(zip(grads, model.trainable_variables))
    # Each variable lives on the PS chosen by hashing its name; compare
    # the PS copy against the locally updated value.
    for v in model.trainable_variables:
        ps_id = string_to_id(v.name, len(self._channels))
        ps_v = self._pservers[ps_id].parameters.get_non_embedding_param(
            v.name)
        np.testing.assert_array_equal(ps_v.numpy(), v.numpy())
def _init_from_args(self, args):
    """
    Initialize worker state from parsed CLI arguments: resolve the model
    spec, set up the task data service and callbacks, and (for
    AllReduce) construct the allreduce trainer.

    Please refer to elastic/python/common/args.py for more details about
    arguments of a worker.
    """
    self._worker_id = args.worker_id
    self._job_type = args.job_type
    self._minibatch_size = args.minibatch_size
    self._log_loss_steps = args.log_loss_steps
    # Resolve user-defined components from the model zoo by name.
    (
        model_inst,
        self._dataset_fn,
        self._loss,
        self._opt_fn,
        self._eval_metrics_fn,
        self._prediction_outputs_processor,
        self._custom_data_reader,
        self._callbacks_list,
    ) = get_model_spec(
        model_zoo=args.model_zoo,
        model_def=args.model_def,
        dataset_fn=args.dataset_fn,
        loss=args.loss,
        optimizer=args.optimizer,
        eval_metrics_fn=args.eval_metrics_fn,
        model_params=args.model_params,
        prediction_outputs_processor=args.prediction_outputs_processor,
        custom_data_reader=args.custom_data_reader,
        callbacks=args.callbacks,
    )
    self._model_handler = ModelHandler.get_model_handler(
        self._distribution_strategy, checkpoint_dir=args.checkpoint_dir)
    model_inst = self._model_handler.get_model_to_train(model_inst)
    self.set_model(model_inst)
    # -1 marks "no model version pulled yet".
    self._model_version = -1
    self._task_data_service = TaskDataService(
        self._mc,
        self._job_type == JobType.TRAINING_WITH_EVALUATION,
        custom_data_reader=self._custom_data_reader,
        data_reader_params=get_dict_from_params_str(
            args.data_reader_params),
        data_origin=args.training_data,
    )
    # When the model zoo defines no dataset_fn, fall back to the data
    # reader's default implementation (if it provides one).
    if self._dataset_fn is None:
        if hasattr(self._task_data_service.data_reader,
                   "default_dataset_fn"):
            self._dataset_fn = (
                self._task_data_service.data_reader.default_dataset_fn())
        else:
            raise ValueError(
                "dataset_fn is required if the data_reader used does "
                "not provide default implementation of dataset_fn")
    self._get_model_steps = args.get_model_steps
    self._opt = self._opt_fn()
    self._model.optimizer = self._opt
    self._non_embed_grads = {}
    self._evaluation_result = {}
    saved_model_exporter = SavedModelExporter(
        self._task_data_service, self._dataset_fn, self._model_handler)
    # Place default callbacks at the head to execute them firstly
    self._callbacks_list.callbacks.insert(0, saved_model_exporter)
    self._callbacks_list.set_model(model_inst)
    set_callback_parameters(
        self._callbacks_list,
        batch_size=args.minibatch_size,
        saved_model_path=args.output,
        checkpoint_path=args.checkpoint_dir,
    )
    # The allreduce trainer is only built under the ALLREDUCE strategy;
    # it talks to the master at the host part of master_addr.
    self._allreduce_trainer = None
    if self._distribution_strategy == DistributionStrategy.ALLREDUCE:
        master_addr = args.master_addr.split(":")[0]
        self._allreduce_trainer = AllReduceTrainer(
            self._mc, master_addr, self._model, self._loss, self._opt)
def __init__(
    self,
    worker_id,
    job_type,
    minibatch_size,
    model_zoo,
    dataset_fn="dataset_fn",
    loss="loss",
    optimizer="optimizer",
    eval_metrics_fn="eval_metrics_fn",
    channel=None,
    embedding_service_endpoint=None,
    model_def=None,
    model_params="",
    data_reader_params="",
    prediction_outputs_processor="PredictionOutputsProcessor",
    max_minibatch_retry_num=DEFAULT_MAX_MINIBATCH_RETRY_NUM,
    get_model_steps=1,
):
    """
    Arguments:
        worker_id: The worker ID.
        job_type: The job type.
        minibatch_size: The size of the minibatch used for each iteration.
        model_zoo: The directory that contains user-defined model files
            or a specific model file.
        dataset_fn: The name of the dataset function defined in the
            model file.
        loss: The name of the loss function defined in the model file.
        optimizer: The name of the optimizer defined in the model file.
        eval_metrics_fn: The name of the evaluation metrics function
            defined in the model file.
        channel: The channel for the gRPC master service.
        embedding_service_endpoint: The endpoint to the embedding service.
        model_def: The import path to the model definition
            function/class in the model zoo, e.g.
            "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string
            separated by semi-colon used to instantiate the model,
            e.g. "param1=1; param2=2".
        data_reader_params: The data reader parameters in a string
            separated by semi-colon used to instantiate the data reader,
            e.g. "param1=1; param2=2".
        prediction_outputs_processor: The name of the prediction output
            processor class defined in the model file.
        get_model_steps: Worker will perform `get_model` from the
            parameter server every this many steps.
        max_minibatch_retry_num: The maximum number of a minibatch retry
            as its results (e.g. gradients) are not accepted by master.
    """
    self._worker_id = worker_id
    self._job_type = job_type
    self._minibatch_size = minibatch_size
    # Resolve user-defined components from the model zoo by name.
    (
        model_inst,
        self._dataset_fn,
        self._loss,
        self._opt_fn,
        self._eval_metrics_fn,
        self._prediction_outputs_processor,
    ) = get_model_spec(
        model_zoo=model_zoo,
        model_def=model_def,
        dataset_fn=dataset_fn,
        loss=loss,
        optimizer=optimizer,
        eval_metrics_fn=eval_metrics_fn,
        model_params=model_params,
        prediction_outputs_processor=prediction_outputs_processor,
    )
    self._embedding_service_endpoint = embedding_service_endpoint
    self.set_model(model_inst)
    # Without a channel the worker runs standalone (no master stub).
    if channel is None:
        self._stub = None
    else:
        self._stub = elasticdl_pb2_grpc.MasterStub(channel)
    self._max_minibatch_retry_num = max_minibatch_retry_num
    # -1 marks "no model version pulled yet".
    self._model_version = -1
    self._task_data_service = TaskDataService(
        self,
        self._job_type == JobType.TRAINING_WITH_EVALUATION,
        data_reader_params=get_dict_from_params_str(data_reader_params),
    )
    self._get_model_steps = get_model_steps
    # With get_model every N>1 steps, gradients are accumulated locally,
    # so a local optimizer instance is required.
    if self._get_model_steps > 1:
        self._opt = self._opt_fn()
        self._non_embed_grads = None
def test_compare_mnist_train(self):
    """Train MNIST through the distributed worker and locally with the
    same seed; per-evaluation (loss, accuracy) tuples must match exactly.
    """
    (
        (x_train, y_train),
        (x_test, y_test),
    ) = tf.keras.datasets.mnist.load_data()
    # Normalize pixels to [0, 1]; labels stay int32 class ids.
    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32) / 255.0
    y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
    x_test = tf.convert_to_tensor(x_test, dtype=tf.float32) / 255.0
    y_test = tf.convert_to_tensor(y_test, dtype=tf.int32)
    db = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    db = db.batch(self._batch_size).repeat(10)
    test_db = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    test_db = test_db.batch(self._batch_size)
    # Fix TF state so the worker run is reproducible.
    tf.keras.backend.clear_session()
    tf.random.set_seed(22)
    stop_step = 20
    worker_results = self._worker_train(
        train_db=db, test_db=test_db, dataset="mnist", stop_step=stop_step
    )
    # Reset and reseed so the local model starts from identical weights.
    tf.keras.backend.clear_session()
    tf.random.set_seed(22)
    acc_meter = tf.keras.metrics.Accuracy()
    (
        model,
        dataset_fn,
        loss_fn,
        opt_fn,
        eval_metrics_fn,
        prediction_outputs_processor,
    ) = get_model_spec(
        model_zoo=self._model_zoo_path,
        model_def=(
            "mnist_functional_api.mnist_functional_api.custom_model"
        ),
        dataset_fn="dataset_fn",
        model_params=None,
        loss="loss",
        optimizer="optimizer",
        eval_metrics_fn="eval_metrics_fn",
        prediction_outputs_processor="PredictionOutputsProcessor",
    )
    local_results = []
    for step, (x, y) in enumerate(db):
        with tf.GradientTape() as tape:
            out = model.call(x, training=True)
            # NOTE(review): arguments are (predictions, labels) here,
            # while sibling tests in this project call
            # loss_fn(labels, predictions) — confirm this model zoo
            # version's loss signature before assuming either order.
            ll = loss_fn(out, y)
        grads = tape.gradient(ll, model.trainable_variables)
        opt_fn().apply_gradients(zip(grads, model.trainable_variables))
        if step % 20 == 0:
            # Score the whole test set every 20 batches, mirroring the
            # worker's evaluation cadence.
            for (x, y) in test_db:
                out = model.call(x, training=False)
                acc_meter.update_state(tf.argmax(out, axis=1), y)
            local_results.append(
                (float(ll.numpy()), float(acc_meter.result().numpy()))
            )
            acc_meter.reset_states()
        if step > stop_step:
            break
    for w, l in zip(worker_results, local_results):
        self.assertTupleEqual(w, l)