Code example #1
File: model_utils_test.py Project: zhaozhy/elasticdl
    def test_get_model_spec(self):
        (
            model,
            dataset_fn,
            loss,
            optimizer,
            eval_metrics_fn,
            prediction_outputs_processor,
        ) = get_model_spec(
            model_zoo=_model_zoo_path,
            model_def="test_module.custom_model",
            dataset_fn="dataset_fn",
            loss="loss",
            optimizer="optimizer",
            eval_metrics_fn="eval_metrics_fn",
            model_params="",
            prediction_outputs_processor="PredictionOutputsProcessor",
        )

        self.assertIsNotNone(model)
        self.assertIsNotNone(dataset_fn)
        self.assertIsNotNone(loss)
        self.assertIsNotNone(optimizer)
        self.assertIsNotNone(eval_metrics_fn)
        self.assertIsNotNone(prediction_outputs_processor)
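
Note: the string arguments above are looked up by name in the model
definition module. A minimal model-zoo module that would satisfy this
lookup might look like the following sketch; the names mirror the
arguments passed to get_model_spec, but the bodies are illustrative
assumptions, not the repo's actual test_module.

import tensorflow as tf


def custom_model():
    return tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(10),
    ])


def dataset_fn(dataset, mode, metadata):
    # Map raw records to (features, label) pairs; the signature here
    # is an assumption.
    return dataset


def loss(labels, predictions):
    return tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(
            labels, predictions, from_logits=True))


def optimizer(lr=0.1):
    return tf.optimizers.SGD(lr)


def eval_metrics_fn():
    return {"accuracy": tf.keras.metrics.SparseCategoricalAccuracy()}


class PredictionOutputsProcessor:
    def process(self, predictions, worker_id):
        print(worker_id, predictions)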
Code example #2
    def test_compare_mnist_train(self):
        model_def = "mnist_functional_api.mnist_functional_api.custom_model"
        self._create_pserver(model_def, 2)
        db, test_db = get_mnist_dataset(self._batch_size)
        stop_step = 20

        self._create_worker(1)
        worker_results = self._worker_train(0,
                                            train_db=db,
                                            test_db=test_db,
                                            stop_step=stop_step)

        tf.keras.backend.clear_session()
        tf.random.set_seed(22)

        acc_meter = tf.keras.metrics.Accuracy()

        (
            model,
            dataset_fn,
            loss_fn,
            opt_fn,
            eval_metrics_fn,
            prediction_outputs_processor,
            create_data_reader_fn,
            callbacks_list,
        ) = get_model_spec(
            model_zoo=self._model_zoo_path,
            model_def=model_def,
            dataset_fn="dataset_fn",
            model_params=None,
            loss="loss",
            optimizer="optimizer",
            eval_metrics_fn="eval_metrics_fn",
            prediction_outputs_processor="PredictionOutputsProcessor",
            custom_data_reader="custom_data_reader",
            callbacks="callbacks",
        )
        local_results = []
        for step, (x, y) in enumerate(db):
            with tf.GradientTape() as tape:
                out = model.call(x, training=True)
                ll = loss_fn(y, out)
            grads = tape.gradient(ll, model.trainable_variables)
            opt_fn().apply_gradients(zip(grads, model.trainable_variables))

            if step % 20 == 0:
                for (x, y) in test_db:
                    out = model.call(x, training=False)
                    acc_meter.update_state(tf.argmax(out, axis=1), y)

                local_results.append(
                    (float(ll.numpy()), float(acc_meter.result().numpy())))
                acc_meter.reset_states()

            if step > stop_step:
                break

        for w, l in zip(worker_results, local_results):
            self.assertTupleEqual(w, l)
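
Note: get_mnist_dataset is not defined on this page. A plausible
implementation, mirroring the inline dataset construction in code
example #11 below, could be:

import tensorflow as tf


def get_mnist_dataset(batch_size):
    # Hypothetical helper: normalize MNIST and wrap the train/test
    # splits in batched tf.data pipelines, as code example #11 does
    # inline.
    (x_train, y_train), (x_test, y_test) = \
        tf.keras.datasets.mnist.load_data()
    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32) / 255.0
    y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
    x_test = tf.convert_to_tensor(x_test, dtype=tf.float32) / 255.0
    y_test = tf.convert_to_tensor(y_test, dtype=tf.int32)

    db = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    db = db.batch(batch_size).repeat(10)
    test_db = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    test_db = test_db.batch(batch_size)
    return db, test_db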
Code example #3
    def _init_model_from_args(self, args):
        """
        Please refer to elasticdl/python/common/args.py for more
        details about the arguments of a worker.
        """
        self._log_loss_steps = args.log_loss_steps
        (
            model_inst,
            self._feed,
            loss,
            opt_fn,
            self._eval_metrics_fn,
            self._prediction_outputs_processor,
            self._custom_data_reader,
            self._callbacks_list,
        ) = get_model_spec(
            model_zoo=args.model_zoo,
            model_def=args.model_def,
            feed=args.feed,
            loss=args.loss,
            optimizer=args.optimizer,
            eval_metrics_fn=args.eval_metrics_fn,
            prediction_outputs_processor=args.prediction_outputs_processor,
            custom_data_reader=args.custom_data_reader,
            callbacks=args.callbacks,
        )

        self._model_handler = ModelHandler.get_model_handler(
            self._distribution_strategy, checkpoint_dir=args.checkpoint_dir)
        self._model_inst = self._model_handler.get_model_to_train(model_inst)
        self._model_inst.optimizer = opt_fn()
        self._model_inst.loss = loss
        self._model_version = -1
        self._get_model_steps = args.get_model_steps
Code example #4
File: worker.py Project: xiaoyili/elasticdl
    def _init_from_args(self, args):
        """
        Please refer to elasticdl/python/common/args.py for more
        details about the arguments of a worker.
        """
        self._worker_id = args.worker_id
        self._job_type = args.job_type
        self._minibatch_size = args.minibatch_size
        (
            model_inst,
            self._dataset_fn,
            self._loss,
            self._opt_fn,
            self._eval_metrics_fn,
            self._prediction_outputs_processor,
            self._custom_data_reader,
        ) = get_model_spec(
            model_zoo=args.model_zoo,
            model_def=args.model_def,
            dataset_fn=args.dataset_fn,
            loss=args.loss,
            optimizer=args.optimizer,
            eval_metrics_fn=args.eval_metrics_fn,
            model_params=args.model_params,
            prediction_outputs_processor=args.prediction_outputs_processor,
            custom_data_reader=args.custom_data_reader,
        )

        self._collective_communicator = (
            CollectiveCommunicator() if self._distribution_strategy
            == DistributionStrategy.ALLREDUCE else None)
        self._model_handler = ModelHandler.get_model_handler(
            self._distribution_strategy, checkpoint_dir=args.checkpoint_dir)
        model_inst = self._model_handler.get_model_to_train(model_inst)
        self.set_model(model_inst)

        self._model_version = -1
        self._model_versions_from_ps = [-1 for _ in range(self._ps_num)]
        self._task_data_service = TaskDataService(
            self,
            self._job_type == JobType.TRAINING_WITH_EVALUATION,
            data_reader_params=get_dict_from_params_str(
                args.data_reader_params),
        )
        if self._dataset_fn is None:
            if hasattr(self._task_data_service.data_reader,
                       "default_dataset_fn"):
                self._dataset_fn = (
                    self._task_data_service.data_reader.default_dataset_fn())
            else:
                raise ValueError(
                    "dataset_fn is required if the data_reader used does "
                    "not provide default implementation of dataset_fn")
        self._get_model_steps = args.get_model_steps
        if self._get_model_steps > 1:
            self._opt = self._opt_fn()
        self._non_embed_grads = {}
        self._evaluation_result = {}
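
Note: the fallback above looks for a default_dataset_fn attribute on
the data reader. A minimal sketch of a data reader that satisfies the
check (the record layout is an assumption) might be:

class CustomDataReader:
    def read_records(self, task):
        # Yield raw records for the given task; omitted here.
        ...

    def default_dataset_fn(self):
        # Return a dataset_fn, matching how the worker calls
        # data_reader.default_dataset_fn() above.
        def dataset_fn(dataset, mode, metadata):
            def parse(record):
                feature, label = record  # assumed record layout
                return feature, label

            return dataset.map(parse)

        return dataset_fn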
Code example #5
File: worker.py Project: chunyang-wen/elasticdl
    def _init_from_args(self, args):
        """
        Please refer to elasticdl/python/common/args.py for more
        details about the arguments of a worker.
        """
        self._worker_id = args.worker_id
        self._job_type = args.job_type
        self._minibatch_size = args.minibatch_size
        (
            model_inst,
            self._dataset_fn,
            self._loss,
            self._opt_fn,
            self._eval_metrics_fn,
            self._prediction_outputs_processor,
        ) = get_model_spec(
            model_zoo=args.model_zoo,
            model_def=args.model_def,
            dataset_fn=args.dataset_fn,
            loss=args.loss,
            optimizer=args.optimizer,
            eval_metrics_fn=args.eval_metrics_fn,
            model_params=args.model_params,
            prediction_outputs_processor=args.prediction_outputs_processor,
        )

        self._embedding_service_endpoint = eval(
            args.embedding_service_endpoint)

        self._distribution_strategy = args.distribution_strategy
        self._collective_communicator = (
            CollectiveCommunicator() if self._distribution_strategy
            == DistributionStrategy.ALLREDUCE else None)
        self._model_handler = ModelHandler.get_model_handler(
            self._distribution_strategy, checkpoint_dir=args.checkpoint_dir)
        model_inst = self._model_handler.get_model_to_train(model_inst)
        self.set_model(model_inst)

        self._model_version = -1
        self._task_data_service = TaskDataService(
            self,
            self._job_type == JobType.TRAINING_WITH_EVALUATION,
            data_reader_params=get_dict_from_params_str(
                args.data_reader_params),
        )
        self._get_model_steps = args.get_model_steps
        if self._get_model_steps > 1:
            self._opt = self._opt_fn()
            self._non_embed_grads = None
        self._evaluation_result = {}
Code example #6
    def __init__(self, args):
        envs = parse_envs(args.envs)
        self._init_environment(envs)

        (
            self.model_inst,
            self.dataset_fn,
            self.loss_fn,
            self.opt_fn,
            self.eval_metrics_fn,
            self.prediction_outputs_processor,
            self.custom_data_reader,
            self.callback_list,
        ) = get_model_spec(
            model_zoo=args.model_zoo,
            model_def=args.model_def,
            dataset_fn=args.dataset_fn,
            loss=args.loss,
            optimizer=args.optimizer,
            eval_metrics_fn=args.eval_metrics_fn,
            model_params=args.model_params,
            prediction_outputs_processor="",
            custom_data_reader=args.custom_data_reader,
            callbacks=args.callbacks,
        )
        self.opt = self.opt_fn()
        self.epoch = args.num_epochs
        self.evaluation_steps = args.evaluation_steps
        self.batch_size = args.minibatch_size
        self.data_reader_params = get_dict_from_params_str(
            args.data_reader_params
        )
        self.records_per_task = (
            args.minibatch_size * args.num_minibatches_per_task
        )

        create_data_reader_fn = (
            create_data_reader
            if self.custom_data_reader is None
            else self.custom_data_reader
        )
        self.data_reader = create_data_reader_fn(
            data_origin=args.training_data,
            records_per_task=self.records_per_task,
            **self.data_reader_params
        )
        self.training_data = args.training_data
        self.validation_data = args.validation_data
        self.save_model_dir = args.output
Code example #7
    def test_get_model_spec(self):
        (
            model,
            dataset_fn,
            loss,
            optimizer,
            eval_metrics_fn,
            prediction_outputs_processor,
            custom_data_reader,
            callback_list,
        ) = get_model_spec(
            model_zoo=_model_zoo_path,
            model_def="test_module.custom_model",
            dataset_fn="dataset_fn",
            loss="loss",
            optimizer="optimizer",
            eval_metrics_fn="eval_metrics_fn",
            model_params="",
            prediction_outputs_processor="PredictionOutputsProcessor",
            custom_data_reader="custom_data_reader",
            callbacks="callbacks",
        )

        self.assertIsNotNone(model)
        self.assertIsNotNone(dataset_fn)
        self.assertIsNotNone(loss)
        self.assertIsNotNone(optimizer)
        self.assertIsNotNone(eval_metrics_fn)
        self.assertIsNotNone(prediction_outputs_processor)
        self.assertIsNotNone(custom_data_reader)
        self.assertIsNotNone(callback_list)
        self.assertEqual(len(callback_list.callbacks), 1)
        self.assertRaisesRegex(
            Exception,
            "Cannot find the custom model function/class "
            "in model definition files",
            get_model_spec,
            model_zoo=_model_zoo_path,
            model_def="test_module.undefined",
            dataset_fn="dataset_fn",
            loss="loss",
            optimizer="optimizer",
            eval_metrics_fn="eval_metrics_fn",
            model_params="",
            prediction_outputs_processor="PredictionOutputsProcessor",
            custom_data_reader="custom_data_reader",
            callbacks="callbacks",
        )
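
Note: the assertRaisesRegex call shows what happens when model_def
names a symbol that does not exist. get_model_spec presumably resolves
each string with a dynamic import; a simplified sketch of such a
lookup (the real elasticdl implementation may differ) is:

import importlib.util
import os


def load_from_module(model_def, model_zoo):
    # Hypothetical resolver: "pkg.module.symbol" -> attribute of the
    # module loaded from the model zoo directory.
    *module_parts, symbol_name = model_def.split(".")
    module_path = os.path.join(model_zoo, *module_parts) + ".py"
    spec = importlib.util.spec_from_file_location(model_def, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    if not hasattr(module, symbol_name):
        raise Exception(
            "Cannot find the custom model function/class "
            "in model definition files")
    return getattr(module, symbol_name)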
Code example #8
    def test_compare_onebatch_train(self):
        model_def = "mnist_functional_api.mnist_functional_api.custom_model"
        self._create_pserver(model_def, 2)
        images, labels = get_random_batch(self._batch_size)
        # TODO(yunjian.lmh): test optimizer wrapper
        arguments = [
            "--worker_id",
            0,
            "--job_type",
            elasticdl_pb2.TRAINING,
            "--minibatch_size",
            self._batch_size,
            "--model_zoo",
            self._model_zoo_path,
            "--model_def",
            model_def,
            "--distribution_strategy",
            DistributionStrategy.PARAMETER_SERVER,
        ]
        args = parse_worker_args(arguments)

        tf.keras.backend.clear_session()
        tf.random.set_seed(22)

        worker = Worker(args, ps_channels=self._channels)
        worker._run_model_call_before_training(images)
        worker.get_model()
        w_loss, w_grads = worker.training_process_eagerly(images, labels)
        worker.report_gradient(w_grads)

        tf.keras.backend.clear_session()
        tf.random.set_seed(22)

        (
            model,
            dataset_fn,
            loss_fn,
            opt_fn,
            eval_metrics_fn,
            prediction_outputs_processor,
            create_data_reader_fn,
            callback_list,
        ) = get_model_spec(
            model_zoo=self._model_zoo_path,
            model_def=model_def,
            dataset_fn="dataset_fn",
            model_params=None,
            loss="loss",
            optimizer="optimizer",
            eval_metrics_fn="eval_metrics_fn",
            prediction_outputs_processor="PredictionOutputsProcessor",
            custom_data_reader="custom_data_reader",
            callbacks="callbacks",
        )

        with tf.GradientTape() as tape:
            output = model.call(images, training=True)
            labels = tf.reshape(labels, [-1])
            loss = loss_fn(labels, output)
        grads = tape.gradient(loss, model.trainable_variables)
        opt_fn().apply_gradients(zip(grads, model.trainable_variables))

        for v in model.trainable_variables:
            ps_id = string_to_id(v.name, len(self._channels))
            ps_v = self._pservers[ps_id].parameters.get_non_embedding_param(
                v.name)
            np.testing.assert_array_equal(ps_v.numpy(), v.numpy())
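
Note: string_to_id partitions variables across parameter servers by
name. Its implementation is not shown on this page; one deterministic
hash-based sketch consistent with the call above (the actual elasticdl
hash may differ) is:

import hashlib


def string_to_id(name, bucket_num):
    # Map a variable name to a parameter-server index in
    # [0, bucket_num), deterministically across processes.
    digest = hashlib.md5(name.encode("utf-8")).hexdigest()
    return int(digest, 16) % bucket_num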
Code example #9
    def _init_from_args(self, args):
        """
        Please refer to elasticdl/python/common/args.py for more
        details about the arguments of a worker.
        """
        self._worker_id = args.worker_id
        self._job_type = args.job_type
        self._minibatch_size = args.minibatch_size
        self._log_loss_steps = args.log_loss_steps
        (
            model_inst,
            self._dataset_fn,
            self._loss,
            self._opt_fn,
            self._eval_metrics_fn,
            self._prediction_outputs_processor,
            self._custom_data_reader,
            self._callbacks_list,
        ) = get_model_spec(
            model_zoo=args.model_zoo,
            model_def=args.model_def,
            dataset_fn=args.dataset_fn,
            loss=args.loss,
            optimizer=args.optimizer,
            eval_metrics_fn=args.eval_metrics_fn,
            model_params=args.model_params,
            prediction_outputs_processor=args.prediction_outputs_processor,
            custom_data_reader=args.custom_data_reader,
            callbacks=args.callbacks,
        )

        self._model_handler = ModelHandler.get_model_handler(
            self._distribution_strategy, checkpoint_dir=args.checkpoint_dir)
        model_inst = self._model_handler.get_model_to_train(model_inst)
        self.set_model(model_inst)

        self._model_version = -1
        self._task_data_service = TaskDataService(
            self._mc,
            self._job_type == JobType.TRAINING_WITH_EVALUATION,
            custom_data_reader=self._custom_data_reader,
            data_reader_params=get_dict_from_params_str(
                args.data_reader_params),
            data_origin=args.training_data,
        )
        if self._dataset_fn is None:
            if hasattr(self._task_data_service.data_reader,
                       "default_dataset_fn"):
                self._dataset_fn = (
                    self._task_data_service.data_reader.default_dataset_fn())
            else:
                raise ValueError(
                    "dataset_fn is required if the data_reader used does "
                    "not provide default implementation of dataset_fn")
        self._get_model_steps = args.get_model_steps
        self._opt = self._opt_fn()
        self._model.optimizer = self._opt
        self._non_embed_grads = {}
        self._evaluation_result = {}

        saved_model_exporter = SavedModelExporter(self._task_data_service,
                                                  self._dataset_fn,
                                                  self._model_handler)
        # Place default callbacks at the head so they execute first
        self._callbacks_list.callbacks.insert(0, saved_model_exporter)
        self._callbacks_list.set_model(model_inst)
        set_callback_parameters(
            self._callbacks_list,
            batch_size=args.minibatch_size,
            saved_model_path=args.output,
            checkpoint_path=args.checkpoint_dir,
        )

        self._allreduce_trainer = None
        if self._distribution_strategy == DistributionStrategy.ALLREDUCE:
            master_addr = args.master_addr.split(":")[0]
            self._allreduce_trainer = AllReduceTrainer(self._mc, master_addr,
                                                       self._model, self._loss,
                                                       self._opt)
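
Note: the callbacks list wired up above comes from a callbacks()
function in the model-zoo module (the callbacks="callbacks" argument),
and code example #7 expects that list to hold exactly one callback. A
hypothetical model-zoo definition compatible with both would be:

import tensorflow as tf


def callbacks():
    # Return the user's callbacks; the worker wraps them in a
    # CallbackList and prepends its own SavedModelExporter, as shown
    # above.
    class PrintLossCallback(tf.keras.callbacks.Callback):
        def on_train_batch_end(self, batch, logs=None):
            if logs and "loss" in logs:
                print("batch", batch, "loss", logs["loss"])

    return [PrintLossCallback()]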
Code example #10
File: worker.py Project: zhaozhy/elasticdl
    def __init__(
        self,
        worker_id,
        job_type,
        minibatch_size,
        model_zoo,
        dataset_fn="dataset_fn",
        loss="loss",
        optimizer="optimizer",
        eval_metrics_fn="eval_metrics_fn",
        channel=None,
        embedding_service_endpoint=None,
        model_def=None,
        model_params="",
        data_reader_params="",
        prediction_outputs_processor="PredictionOutputsProcessor",
        max_minibatch_retry_num=DEFAULT_MAX_MINIBATCH_RETRY_NUM,
        get_model_steps=1,
    ):
        """
        Arguments:
            worker_id: The worker ID.
            job_type: The job type.
            minibatch_size: The size of the minibatch used for each iteration.
            model_zoo: The directory that contains user-defined model files
                or a specific model file.
            dataset_fn: The name of the dataset function defined in the
                model file.
            loss: The name of the loss function defined in the model file.
            optimizer: The name of the optimizer defined in the model file.
            eval_metrics_fn: The name of the evaluation metrics function
                defined in the model file.
            channel: The channel for the gRPC master service.
            embedding_service_endpoint: The endpoint to the embedding service.
            model_def: The import path to the model definition
                function/class in the model zoo, e.g.
                "cifar10_subclass.CustomModel".
            model_params: The model parameters used to instantiate the
                model, given as a semicolon-separated string,
                e.g. "param1=1; param2=2".
            data_reader_params: The parameters used to instantiate the
                data reader, given as a semicolon-separated string,
                e.g. "param1=1; param2=2".
            prediction_outputs_processor: The name of the prediction output
                processor class defined in the model file.
            get_model_steps: The worker performs `get_model` from the
                parameter server once every this many steps.
            max_minibatch_retry_num: The maximum number of times to retry
                a minibatch when its results (e.g. gradients) are not
                accepted by the master.
        """
        self._worker_id = worker_id
        self._job_type = job_type
        self._minibatch_size = minibatch_size
        (
            model_inst,
            self._dataset_fn,
            self._loss,
            self._opt_fn,
            self._eval_metrics_fn,
            self._prediction_outputs_processor,
        ) = get_model_spec(
            model_zoo=model_zoo,
            model_def=model_def,
            dataset_fn=dataset_fn,
            loss=loss,
            optimizer=optimizer,
            eval_metrics_fn=eval_metrics_fn,
            model_params=model_params,
            prediction_outputs_processor=prediction_outputs_processor,
        )
        self._embedding_service_endpoint = embedding_service_endpoint
        self.set_model(model_inst)

        if channel is None:
            self._stub = None
        else:
            self._stub = elasticdl_pb2_grpc.MasterStub(channel)
        self._max_minibatch_retry_num = max_minibatch_retry_num
        self._model_version = -1
        self._task_data_service = TaskDataService(
            self,
            self._job_type == JobType.TRAINING_WITH_EVALUATION,
            data_reader_params=get_dict_from_params_str(data_reader_params),
        )
        self._get_model_steps = get_model_steps
        if self._get_model_steps > 1:
            self._opt = self._opt_fn()
            self._non_embed_grads = None
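
Note: the docstring above describes model_params and data_reader_params
as semicolon-separated "key=value" strings. get_dict_from_params_str,
used throughout these examples, turns that format into a keyword
dictionary; a hedged sketch of such a parser (the real implementation
may differ) is:

from ast import literal_eval


def get_dict_from_params_str(params_str):
    # Parse strings like "param1=1; param2=2" into {"param1": 1,
    # "param2": 2}, per the format in the docstring of code example
    # #10.
    if not params_str:
        return {}
    params = {}
    for pair in params_str.split(";"):
        key, value = pair.split("=", 1)
        try:
            params[key.strip()] = literal_eval(value.strip())
        except (ValueError, SyntaxError):
            params[key.strip()] = value.strip()
    return params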
Code example #11
    def test_compare_mnist_train(self):
        (
            (x_train, y_train),
            (x_test, y_test),
        ) = tf.keras.datasets.mnist.load_data()
        x_train = tf.convert_to_tensor(x_train, dtype=tf.float32) / 255.0
        y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)

        x_test = tf.convert_to_tensor(x_test, dtype=tf.float32) / 255.0
        y_test = tf.convert_to_tensor(y_test, dtype=tf.int32)

        db = tf.data.Dataset.from_tensor_slices((x_train, y_train))
        db = db.batch(self._batch_size).repeat(10)
        test_db = tf.data.Dataset.from_tensor_slices((x_test, y_test))
        test_db = test_db.batch(self._batch_size)

        tf.keras.backend.clear_session()
        tf.random.set_seed(22)
        stop_step = 20
        worker_results = self._worker_train(
            train_db=db, test_db=test_db, dataset="mnist", stop_step=stop_step
        )

        tf.keras.backend.clear_session()
        tf.random.set_seed(22)

        acc_meter = tf.keras.metrics.Accuracy()

        (
            model,
            dataset_fn,
            loss_fn,
            opt_fn,
            eval_metrics_fn,
            prediction_outputs_processor,
        ) = get_model_spec(
            model_zoo=self._model_zoo_path,
            model_def=(
                "mnist_functional_api.mnist_functional_api.custom_model"
            ),
            dataset_fn="dataset_fn",
            model_params=None,
            loss="loss",
            optimizer="optimizer",
            eval_metrics_fn="eval_metrics_fn",
            prediction_outputs_processor="PredictionOutputsProcessor",
        )
        local_results = []
        for step, (x, y) in enumerate(db):
            with tf.GradientTape() as tape:
                out = model.call(x, training=True)
                ll = loss_fn(out, y)
            grads = tape.gradient(ll, model.trainable_variables)
            opt_fn().apply_gradients(zip(grads, model.trainable_variables))

            if step % 20 == 0:
                for (x, y) in test_db:
                    out = model.call(x, training=False)
                    acc_meter.update_state(tf.argmax(out, axis=1), y)

                local_results.append(
                    (float(ll.numpy()), float(acc_meter.result().numpy()))
                )
                acc_meter.reset_states()

            if step > stop_step:
                break

        for w, l in zip(worker_results, local_results):
            self.assertTupleEqual(w, l)