Example #1
    def testEvaluationOnly(self):
        task_d = _TaskDispatcher({}, {"f1": (0, 10), "f2": (0, 10)}, {}, 3, 1)

        evaluation_service = EvaluationService(
            None, None, task_d, 0, 0, 0, True
        )
        task_d.set_evaluation_service(evaluation_service)

        master = MasterServicer(
            2,
            2,
            None,
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=None,
            evaluation_service=evaluation_service,
        )
        master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

        self.assertEqual(8, len(task_d._todo))
        for i in range(8):
            self.assertFalse(evaluation_service._eval_job.finished())
            evaluation_service.complete_task()
        self.assertTrue(evaluation_service._eval_job.finished())
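The count of 8 pending tasks follows from the dispatcher arguments: each of the two evaluation files holds 10 records and records_per_task is 3, so each file splits into 4 tasks. A quick sanity check of that arithmetic; the splitting rule (one task per chunk of records_per_task records, the last chunk possibly short) is an assumption inferred from the asserted counts:

import math

records_per_file = 10
records_per_task = 3
num_eval_files = 2
# Assumed rule: ceil(records / records_per_task) tasks per file.
tasks_per_file = math.ceil(records_per_file / records_per_task)  # 4
print(num_eval_files * tasks_per_file)  # 8, matching len(task_d._todo)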
Example #2
    def testUserDefinedModel(self):
        master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        req = elasticdl_pb2.GetModelRequest()
        req.method = elasticdl_pb2.MINIMUM
        req.version = 0

        model_inst = SimpleModel()
        model_inst.build(SimpleModel.input_shapes())
        for variable in model_inst.trainable_variables:
            master.set_model_var(variable.name, variable.numpy())
        # Get version 0
        model = master.GetModel(req, None)
        self.assertEqual(0, model.version)
        self.assertEqual(
            [
                "dense_1/bias:0",
                "dense_1/kernel:0",
                "dense_2/bias:0",
                "dense_2/kernel:0",
            ],
            list(sorted(model.param.keys())),
        )
Example #3
    def testEvaluationService(self):
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testEvaluationService")
            checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
            task_d = _TaskDispatcher(
                {"f1": (0, 10), "f2": (0, 10)},
                {"f1": (0, 10), "f2": (0, 10)},
                {},
                3,
                1,
            )

            # Evaluation metrics are not accepted if no evaluation is ongoing
            evaluation_service = EvaluationService(
                checkpoint_service, None, task_d, 10, 20, 0, False
            )
            evaluation_metrics = {
                "mse": ndarray_to_tensor(
                    np.array([100, 200], dtype=np.float32)
                )
            }
            self.assertFalse(
                evaluation_service.report_evaluation_metrics(
                    1, evaluation_metrics
                )
            )

            # No checkpoint available
            self.assertFalse(evaluation_service.try_to_create_new_job())

            master = MasterServicer(
                2,
                2,
                None,
                task_d,
                init_var=[],
                checkpoint_filename_for_init="",
                checkpoint_service=checkpoint_service,
                evaluation_service=evaluation_service,
            )
            master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

            # Add an evaluation task so that evaluation can start
            self.assertEqual(8, len(task_d._todo))
            evaluation_service.add_evaluation_task(0)
            self.assertEqual(16, len(task_d._todo))
            self.assertFalse(evaluation_service._eval_job.finished())

            for i in range(8):
                self.assertFalse(evaluation_service._eval_job.finished())
                evaluation_service.complete_task()
            self.assertIsNone(evaluation_service._eval_job)
            self.assertFalse(evaluation_service.try_to_create_new_job())
Example #4
    def testGetModel(self):
        master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))
        # Now master model is version 0
        self.assertEqual(0, master._version)

        # Get version 0 with minimum method
        req = elasticdl_pb2.GetModelRequest()
        req.version = 0
        req.method = elasticdl_pb2.MINIMUM
        model = master.GetModel(req, None)
        self.assertEqual(0, model.version)
        self.assertEqual(["x"], list(model.param.keys()))
        np.testing.assert_array_equal(
            np.array([1.0, 1.0]), tensor_to_ndarray(model.param["x"])
        )

        # Increase the master model version to 1, but still request
        # version 0 with the MINIMUM method; we should get version 1
        master._version = 1
        master.set_model_var("x", np.array([2.0, 2.0], dtype=np.float32))
        master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))
        model = master.GetModel(req, None)
        self.assertEqual(1, model.version)
        self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
        np.testing.assert_array_equal(
            np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
        )
        np.testing.assert_array_equal(
            np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
        )

        # Try to get version 2, it should raise an exception.
        req.version = 2
        self.assertRaises(ValueError, master.GetModel, req, None)

        # Get fixed version 1
        req.method = elasticdl_pb2.FIXED
        req.version = 1
        model = master.GetModel(req, None)
        self.assertEqual(1, model.version)
        self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
        np.testing.assert_array_equal(
            np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
        )
        np.testing.assert_array_equal(
            np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
        )

        # Previous model unavailable due to no checkpoint
        req.version = 0
        model = master.GetModel(req, None)
        self.assertFalse(model.param)

        # Previous model available through checkpoint
        with tempfile.TemporaryDirectory() as tempdir:
            chk_dir = os.path.join(tempdir, "testGetModel")
            os.makedirs(chk_dir)
            req.version = master._version
            req.method = elasticdl_pb2.MINIMUM
            model = master.GetModel(req, None)
            master._checkpoint_service = CheckpointService(
                chk_dir, 2, 5, False
            )
            master._checkpoint_service.save(master._version, model, False)
            master._version = 2
            master.set_model_var("z", np.array([2.0, 2.0], dtype=np.float32))
            req.version = 1
            req.method = elasticdl_pb2.FIXED
            model = master.GetModel(req, None)
            self.assertEqual(1, model.version)
            self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
            np.testing.assert_array_equal(
                np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
            )
            np.testing.assert_array_equal(
                np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
            )
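As the assertions above suggest, the two request methods behave differently: MINIMUM treats req.version as a lower bound and returns the master's current in-memory model (raising ValueError if the requested version is ahead of the master), while FIXED demands exactly req.version and falls back to a saved checkpoint when that version is no longer in memory. A compact illustration, assuming the same elasticdl_pb2 import as the test:

req = elasticdl_pb2.GetModelRequest()

# Lower-bound request: any model version >= 0 satisfies it, so the master's
# current in-memory model is returned.
req.method = elasticdl_pb2.MINIMUM
req.version = 0

# Exact-version request: only version 1 satisfies it; if the master has
# already moved on, the model is restored from a checkpoint (if one exists).
req.method = elasticdl_pb2.FIXED
req.version = 1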
Example #5
    def testReportGradient(self):
        def makeGrad():
            """ Make a ReportGradientRequest compatible with model"""
            req = elasticdl_pb2.ReportGradientRequest()
            req.gradient["x"].CopyFrom(
                ndarray_to_tensor(np.array([0.1], dtype=np.float32))
            )
            req.gradient["y"].CopyFrom(
                ndarray_to_tensor(np.array([0.03, 0.06], dtype=np.float32))
            )
            req.model_version = 1
            return req

        master = MasterServicer(
            3,
            3,
            tf.optimizers.SGD(0.1),
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        master._version = 1
        master.set_model_var("x", np.array([2.0], dtype=np.float32))
        master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))

        # Report a future version, should raise an exception
        req = makeGrad()
        req.model_version = 2
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report an old version, should not be accepted
        req = makeGrad()
        req.model_version = 0
        res = master.ReportGradient(req, None)
        self.assertFalse(res.accepted)
        self.assertEqual(1, res.model_version)

        # Report an unknown gradient, should raise.
        req = makeGrad()
        req.gradient["z"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report an incompatible gradient, should raise.
        req = makeGrad()
        req.gradient["y"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report a current version, should be accepted.
        req = makeGrad()
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(1, res.model_version)

        # Report a current version with partial gradients, should be accepted.
        req = makeGrad()
        del req.gradient["y"]
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(1, res.model_version)
        # Gradient should be accumulated.
        np.testing.assert_array_equal(
            np.array([0.2], dtype=np.float32), master._gradient_sum["x"]
        )
        np.testing.assert_array_equal(
            np.array([0.03, 0.06], dtype=np.float32), master._gradient_sum["y"]
        )
        self.assertEqual(2, master._grad_n)

        # Report a current version, should be accepted, and a new version
        # created
        req = makeGrad()
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(2, res.model_version)
        self.assertFalse(master._gradient_sum)
        self.assertEqual(0, master._grad_n)
        np.testing.assert_array_equal(
            # [2] - 0.1 * [0.1]
            np.array([1.99], dtype=np.float32),
            master._model["x"].numpy(),
        )
        np.testing.assert_array_equal(
            # [12, 13] - 0.1 * [0.02, 0.04]
            np.array([11.998, 12.996], dtype=np.float32),
            master._model["y"].numpy(),
        )
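The final expected values can be reproduced by hand: with grads_to_wait=3 the master averages the accumulated gradients over three reports before applying SGD with learning rate 0.1. A small numpy check of that arithmetic (the average-then-apply rule is inferred from the comments and asserted values above):

import numpy as np

lr = 0.1
grads_to_wait = 3
# "x" received 0.1 in all three accepted reports; "y" only in two of them.
grad_sum_x = np.array([0.1, 0.1, 0.1], dtype=np.float32).sum()  # 0.3
grad_sum_y = 2 * np.array([0.03, 0.06], dtype=np.float32)       # [0.06, 0.12]
x = np.array([2.0], dtype=np.float32) - lr * grad_sum_x / grads_to_wait
y = np.array([12.0, 13.0], dtype=np.float32) - lr * grad_sum_y / grads_to_wait
print(x)  # ~[1.99]
print(y)  # ~[11.998, 12.996]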
Example #6
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master. Grpc
    calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The model parameters in a string that will be used to
            instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model module.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callback classes that will be instantiated
            and called at given stages of the training procedure.
        use_async: A Python bool. True if using asynchronous updates.
        get_model_steps: The worker calls `get_model` on the parameter server
            every `get_model_steps` steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more tasks should remain.
    if task.shard_name:
        raise RuntimeError(
            "Some tasks are still unfinished after the worker exits.")
    return master._version
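For reference, a hypothetical call to the helper above; the feature shape and model zoo path are placeholders, and the model definition is the example path mentioned in the docstring:

model_version = distributed_train_and_evaluate(
    feature_shape=[32, 32, 3],                 # placeholder input shape
    model_zoo_path="/path/to/model_zoo",       # placeholder directory
    model_def="cifar10_subclass.CustomModel",  # example from the docstring
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    use_async=False,
    get_model_steps=1,
)
assert model_version >= 0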
Example #7
class ParameterServerModelHandlerTest(unittest.TestCase):
    def setUp(self):
        tf.keras.backend.clear_session()
        self.master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        self.master._version = 1
        self.model_handler = ModelHandler.get_model_handler(
            distribution_strategy="ParameterServerStrategy", stub=self.master)

    def test_get_model_to_train(self):
        model_inst = custom_model_with_embedding()
        model_inst = self.model_handler.get_model_to_train(model_inst)
        self.assertEqual(type(model_inst.layers[1]), Embedding)

    def test_get_model_to_export(self):
        model_inst = custom_model_with_embedding()
        trained_params = _mock_model_trained_params(model_inst)
        for name, value in trained_params.items():
            self.master.set_model_var(name, value)

        train_model = self.model_handler.get_model_to_train(model_inst)
        export_model = self.model_handler.get_model_to_export(train_model,
                                                              dataset=None)

        test_data = tf.constant([0])
        result = export_model.call(test_data).numpy()
        self.assertEqual(result[0][0], 3.0)

    def test_get_subclass_model_to_export(self):
        def _get_dataset():
            dataset = tf.data.Dataset.from_tensor_slices(
                np.random.randint(0, 10, (10, 4)))
            dataset = dataset.batch(2)
            return dataset

        model_inst = CustomModel()
        dataset = _get_dataset()

        trained_params = {
            "custom_model/embedding/embeddings:0": np.ones((4, 2),
                                                           dtype="float32"),
            "custom_model/dense/kernel:0": np.ones((2, 1), dtype="float32"),
            "custom_model/dense/bias:0": np.ones((1), dtype="float32"),
        }

        for name, value in trained_params.items():
            self.master.set_model_var(name, value)

        train_model = self.model_handler.get_model_to_train(model_inst)
        self.assertEqual(type(train_model.embedding), Embedding)

        export_model = self.model_handler.get_model_to_export(train_model,
                                                              dataset=dataset)

        test_data = tf.constant([0])
        result = export_model.call(test_data).numpy()
        self.assertEqual(result[0][0], 3.0)
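The expected value of 3.0 in test_get_subclass_model_to_export follows from the all-ones parameters: looking up embedding row 0 yields [1.0, 1.0], and a Dense layer with an all-ones (2, 1) kernel plus a bias of 1.0 maps that to 1 + 1 + 1 = 3. A minimal numpy check of that arithmetic:

import numpy as np

embeddings = np.ones((4, 2), dtype="float32")
kernel = np.ones((2, 1), dtype="float32")
bias = np.ones((1,), dtype="float32")
embedded = embeddings[0]           # lookup for input index 0 -> [1., 1.]
output = embedded @ kernel + bias  # 1*1 + 1*1 + 1 = 3
print(output)                      # [3.]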
Example #8
    def distributed_train_and_evaluate(
        self,
        feature_shape,
        model_def,
        model_params="",
        training=True,
        dataset="",
    ):
        """
        Run distributed training and evaluation with a local master.
        gRPC calls are mocked by local master calls.
        """
        job_type = (JobType.TRAINING_ONLY
                    if training else JobType.EVALUATION_ONLY)
        batch_size = 16
        worker = Worker(
            1,
            job_type,
            batch_size,
            _model_zoo_path,
            model_def=model_def,
            model_params=model_params,
            channel=None,
        )

        if dataset == "imagenet":
            batch_size = 8
            shards = {create_imagenet_recordio_file(8, feature_shape): (0, 8)}
        elif dataset == "frappe":
            shards = {
                create_frappe_recordio_file(16, feature_shape, 5383): (0, 16)
            }
        else:
            shards = {create_recordio_file(128, feature_shape): (0, 128)}

        if training:
            training_shards = shards
            evaluation_shards = shards
        else:
            training_shards = {}
            evaluation_shards = shards
        task_d = _TaskDispatcher(
            training_shards,
            evaluation_shards,
            {},
            records_per_task=64,
            num_epochs=1,
        )
        # Initialize checkpoint service
        checkpoint_service = CheckpointService("", 0, 0, True)
        if training:
            evaluation_service = EvaluationService(checkpoint_service, None,
                                                   task_d, 0, 0, 1, False)
        else:
            evaluation_service = EvaluationService(checkpoint_service, None,
                                                   task_d, 0, 0, 0, True)
        task_d.set_evaluation_service(evaluation_service)
        # The master service
        master = MasterServicer(
            2,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=evaluation_service,
        )
        worker._stub = InProcessMaster(master)

        for var in worker._model.trainable_variables:
            master.set_model_var(var.name, var.numpy())

        worker.run()

        req = elasticdl_pb2.GetTaskRequest()
        req.worker_id = 1
        task = master.GetTask(req, None)
        # No more tasks.
        self.assertFalse(task.shard_name)
Example #9
    def distributed_train_and_evaluate(self, training=True):
        """
        Run distributed training and evaluation with a local master.
        gRPC calls are mocked by local master calls.
        """
        class _Master(InProcessMaster):
            def ReportGradient(self, req):
                if 2 < self._m._version < 80:
                    # For testing retries when the gradient is not accepted.
                    # Increase the master version to reject the gradient.
                    self._m._version += 1
                return self._m.ReportGradient(req, None)

            def ReportEvaluationMetrics(self, req):
                if 2 < self._m._version < 80:
                    # Testing of evaluation retries. Increase the master
                    # version so the evaluation metrics will not be accepted.
                    self._m._version += 1
                return self._m.ReportEvaluationMetrics(req, None)

        job_type = (JobType.TRAINING_ONLY
                    if training else JobType.EVALUATION_ONLY)
        batch_size = 16
        worker = Worker(
            1,
            job_type,
            batch_size,
            _model_zoo_path,
            model_def="test_module.custom_model",
            channel=None,
        )

        shards = {create_recordio_file(128): 128}
        if training:
            training_shards = shards
            evaluation_shards = {}
        else:
            training_shards = {}
            evaluation_shards = shards
        task_d = _TaskDispatcher(
            training_shards,
            evaluation_shards,
            {},
            records_per_task=64,
            num_epochs=1,
        )
        if not training:
            evaluation_service = EvaluationService(None, None, task_d, 0, 0, 0,
                                                   True)
            task_d.set_evaluation_service(evaluation_service)
        else:
            evaluation_service = None
        master = MasterServicer(
            2,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=None,
            evaluation_service=evaluation_service,
        )
        worker._stub = _Master(master)

        for var in worker._model.trainable_variables:
            master.set_model_var(var.name, var.numpy())

        worker.run()

        req = elasticdl_pb2.GetTaskRequest()
        req.worker_id = 1
        task = master.GetTask(req, None)
        # No more tasks.
        self.assertFalse(task.shard_name)
Example #10
    def distributed_train_and_evaluate(
        self,
        training=True,
        callback_classes=[],
        use_async=False,
        grads_to_wait=2,
        get_model_steps=1,
    ):
        """
        Run distributed training and evaluation with a local master.
        gRPC calls are mocked by local master calls.
        """

        if use_async and grads_to_wait > 1:
            raise ValueError(
                "grads_to_wait should be 1 when using asynchronous SGD."
            )

        job_type = (
            JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
        )
        batch_size = 16
        worker = Worker(
            1,
            job_type,
            batch_size,
            _model_zoo_path,
            model_def="test_module.custom_model",
            channel=None,
            get_model_steps=get_model_steps,
        )

        shards = {create_recordio_file(128): (0, 128)}
        if training:
            training_shards = shards
            evaluation_shards = {}
        else:
            training_shards = {}
            evaluation_shards = shards
        task_d = _TaskDispatcher(
            training_shards,
            evaluation_shards,
            {},
            records_per_task=64,
            num_epochs=1,
        )
        if not training:
            evaluation_service = EvaluationService(
                None, None, task_d, 0, 0, 0, True
            )
            task_d.set_evaluation_service(evaluation_service)
        else:
            evaluation_service = None
        master = MasterServicer(
            grads_to_wait,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=None,
            evaluation_service=evaluation_service,
            use_async=use_async,
        )
        callbacks = [
            callback_class(master, worker, self)
            for callback_class in callback_classes
        ]
        worker._stub = InProcessMaster(master, callbacks)

        for var in worker._model.trainable_variables:
            master.set_model_var(var.name, var.numpy())

        worker.run()

        req = elasticdl_pb2.GetTaskRequest()
        req.worker_id = 1
        task = master.GetTask(req, None)
        # No more tasks.
        self.assertFalse(task.shard_name)
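A hypothetical caller of this helper, assuming (as the self parameter suggests) that it is defined on a unittest.TestCase subclass; the asynchronous case passes grads_to_wait=1 to satisfy the guard at the top of the method:

import unittest

class WorkerDistributedTrainTest(unittest.TestCase):
    # distributed_train_and_evaluate (defined above) is assumed to be a
    # method of this class.
    def test_distributed_train_sync(self):
        self.distributed_train_and_evaluate(training=True)

    def test_distributed_train_async(self):
        self.distributed_train_and_evaluate(
            training=True, use_async=True, grads_to_wait=1
        )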