Example 1
    def _create_master_and_worker(self,
                                  service_endpoint=None,
                                  embedding_dims={}):
        model_inst = custom_model()
        master = MasterServicer(
            2,
            2,
            tf.optimizers.SGD(0.1),
            None,
            init_var=model_inst.trainable_variables,
            embedding_service_endpoint=service_endpoint,
            embedding_dims=embedding_dims,
            checkpoint_filename_for_init=None,
            checkpoint_service=None,
            evaluation_service=None,
        )
        arguments = [
            "--worker_id",
            1,
            "--job_type",
            JobType.TRAINING_ONLY,
            "--minibatch_size",
            2,
            "--model_zoo",
            _model_zoo_path,
            "--model_def",
            "test_module.custom_model",
        ]
        args = parse_worker_args(arguments)

        worker = Worker(args)
        worker.set_model(model_inst)
        worker._stub = InProcessMaster(master)

        return master, worker
Example 2
def main():
    args = parse_worker_args()
    channel = grpc.insecure_channel(
        args.master_addr,
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )

    logger = log_util.get_logger(__name__)

    logger.info("Starting worker %d", args.worker_id)
    worker = Worker(
        args.worker_id,
        args.job_type,
        args.minibatch_size,
        args.model_zoo,
        channel=channel,
        embedding_service_endpoint=eval(args.embedding_service_endpoint),
        dataset_fn=args.dataset_fn,
        loss=args.loss,
        optimizer=args.optimizer,
        eval_metrics_fn=args.eval_metrics_fn,
        model_def=args.model_def,
        model_params=args.model_params,
        get_model_steps=args.get_model_steps,
    )
    worker.run()
Example 3
def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)
    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")

    master_channel = build_channel(args.master_addr)

    ps_channels = []
    if args.ps_addrs:
        ps_addrs = args.ps_addrs.split(",")

        for addr in ps_addrs:
            # addr is in the form "ps-pod-name.namespace.svc:port"
            channel = build_channel(addr)

            # Wait until the channel is ready using a Future object.
            grpc.channel_ready_future(channel).result()
            logger.info("grpc channel %s to connect pod %s is ready" %
                        (addr, addr.split(".")[0]))
            ps_channels.append(channel)

    worker = Worker(args, channel=master_channel, ps_channels=ps_channels)
    worker.run()
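
Note: Examples 3, 5, 10, 12, and 15 call build_channel, which is not part of
these excerpts. Based on the inline grpc.insecure_channel calls in Examples 2,
4, and 7, a helper along these lines is a plausible sketch (an assumption, not
the library's actual implementation; the GRPC message-size constants are the
ones imported by those examples):

import grpc

def build_channel(addr):
    # Sketch only: wrap grpc.insecure_channel with the same message-size
    # options that Examples 2, 4, and 7 pass inline.
    return grpc.insecure_channel(
        addr,
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )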
Example 4
def main():
    args = parse_worker_args()
    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")
    channel = grpc.insecure_channel(
        args.master_addr,
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )

    # TODO: create PS channels here
    ps_addrs = args.ps_addrs.split(",")
    # Just print ps_addrs out to avoid flake8 failure
    # This print can be removed once we initialize ps_channels
    # by using ps_addrs
    print("Parameter server addresses are %s" % ps_addrs)
    ps_channels = None

    logger = log_utils.get_logger(__name__)

    logger.info("Starting worker %d", args.worker_id)
    worker = Worker(args, channel=channel, ps_channels=ps_channels)
    worker.run()
Example 5
def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)
    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")

    master_channel = build_channel(args.master_addr)

    ps_channels = []
    if args.ps_addrs:
        # TODO: use ps_addrs from master directly after ps service is working.
        #       Get ps pod ip for ps grpc connection for now.
        ps_addrs = args.ps_addrs.split(",")

        config.load_incluster_config()
        api = client.CoreV1Api()

        for addr in ps_addrs:
            # addr is in the form "ps-pod-name.namespace.svc:port"
            addr_splitted = addr.split(".")
            while True:
                pod = api.read_namespaced_pod(
                    namespace=addr_splitted[1], name=addr_splitted[0]
                )
                if pod.status.pod_ip:
                    break
                # If ps pod is not ready yet, sleep 2 seconds and try again.
                time.sleep(2)
            addr = pod.status.pod_ip + ":" + addr.split(":")[-1]
            channel = grpc.insecure_channel(
                addr,
                options=[
                    (
                        "grpc.max_send_message_length",
                        GRPC.MAX_SEND_MESSAGE_LENGTH,
                    ),
                    (
                        "grpc.max_receive_message_length",
                        GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
                    ),
                ],
            )

            # Wait until the channel is ready using a Future object.
            grpc.channel_ready_future(channel).result()
            logger.info(
                "grpc channel %s to connect pod %s is ready"
                % (addr, pod.metadata.name)
            )
            ps_channels.append(channel)

    worker = Worker(args, channel=master_channel, ps_channels=ps_channels)
    worker.run()
Example 6
    def test_restart_ps(self):
        model_def = "mnist.mnist_functional_api.custom_model"
        num_data = 8
        training_data = [
            get_random_batch(self._batch_size) for _ in range(num_data)
        ]
        workers = []
        self._create_pserver(model_def, 2)
        for w in range(2):
            self._reset_pserver()
            arguments = [
                "--worker_id",
                0,
                "--job_type",
                elasticdl_pb2.TRAINING,
                "--minibatch_size",
                self._batch_size,
                "--model_zoo",
                self._model_zoo_path,
                "--model_def",
                model_def,
                "--distribution_strategy",
                DistributionStrategy.PARAMETER_SERVER,
            ]
            args = parse_worker_args(arguments)
            tf.keras.backend.clear_session()
            tf.random.set_seed(22)
            worker = Worker(args, ps_client=PSClient(self._channels))
            workers.append(worker)
            worker._trainer._run_model_call_before_training(
                training_data[0][0])
            for i in range(num_data):
                worker._trainer._get_model()
                w_loss, w_grads = worker._trainer._training_process_eagerly(
                    training_data[i][0], training_data[i][1])
                worker._trainer._report_gradient(w_grads)
                if w == 1 and i == 3:
                    # Restart ps for the 2nd worker at i==3
                    # self._restart_pserver(model_def)
                    self._reset_pserver()
                    # `push_dense_parameters` will be called in `get_model` to
                    # initialize variables on ps with worker variables
                    worker._trainer._get_model()
                    # send the grads again as these grads are not applied
                    # on worker variables
                    worker._trainer._report_gradient(w_grads)

        for var_name in workers[0]._trainer._non_embed_vars:
            np.testing.assert_array_equal(
                workers[0]._trainer._non_embed_vars[var_name].numpy(),
                workers[1]._trainer._non_embed_vars[var_name].numpy(),
            )
        self._close_channels()
Example 7
def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)
    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")

    master_channel = grpc.insecure_channel(
        args.master_addr,
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )

    ps_channels = []
    if args.ps_addrs:
        # TODO: use ps_addrs from master directly after ps service is working.
        #       Get ps pod ip for ps grpc connection for now.
        ps_addrs = args.ps_addrs.split(",")
        from kubernetes import client, config

        config.load_incluster_config()
        api = client.CoreV1Api()

        for addr in ps_addrs:
            # addr is in the form "ps-pod-name.namespace.svc:port"
            addr_splitted = addr.split(".")
            pod = api.read_namespaced_pod(namespace=addr_splitted[1],
                                          name=addr_splitted[0])
            addr = pod.status.pod_ip + ":" + addr.split(":")[-1]
            channel = grpc.insecure_channel(
                addr,
                options=[
                    (
                        "grpc.max_send_message_length",
                        GRPC.MAX_SEND_MESSAGE_LENGTH,
                    ),
                    (
                        "grpc.max_receive_message_length",
                        GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
                    ),
                ],
            )
            ps_channels.append(channel)

    worker = Worker(args, channel=master_channel, ps_channels=ps_channels)
    worker.run()
Example 8
    def test_worker_pull_embedding(self):
        model_def = "mnist_functional_api.mnist_functional_api.custom_model"
        self._create_pserver(model_def, 2)
        arguments = [
            "--worker_id",
            0,
            "--job_type",
            elasticdl_pb2.TRAINING,
            "--minibatch_size",
            self._batch_size,
            "--model_zoo",
            self._model_zoo_path,
            "--model_def",
            model_def,
            "--distribution_strategy",
            DistributionStrategy.PARAMETER_SERVER,
        ]
        args = parse_worker_args(arguments)
        worker = Worker(args, ps_channels=self._channels)

        # Test lookup embedding vectors that do not exist
        layers = ["test-2", "test-2-slot"]
        ids = [3, 5, 1, 6, 10, 2, 1, 2, 4, 7, 9]
        embedding_table_args = [
            (layers[0], 8, "uniform", False),
            (layers[1], 8, 3.3, True),
        ]

        # initialize embedding table object
        for pserver in self._pservers:
            for layer, table_args in zip(layers, embedding_table_args):
                pserver.parameters.embedding_params[layer] = EmbeddingTable(
                    *table_args
                )

        result_dict = {}
        for layer in layers:
            embedding = worker.pull_embedding_vectors(layer, ids)
            result_dict[layer] = embedding

        for layer in layers:
            expected_result = []
            for embedding_id in ids:
                ps_id = int_to_id(embedding_id, len(self._pservers))
                table = self._pservers[ps_id].parameters.embedding_params[
                    layer
                ]
                expected_result.append(table.get([embedding_id]))
            expected_result = np.concatenate(expected_result)
            self.assertTrue(np.allclose(expected_result, result_dict[layer]))
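
Note: int_to_id, used above to route each embedding id to a parameter server,
is not shown in the excerpt. The way the expected results are assembled
suggests plain modulo sharding; a minimal sketch under that assumption (not
necessarily the library's implementation):

def int_to_id(embedding_id, ps_num):
    # Hypothetical sketch: shard embedding ids across ps_num parameter
    # servers by taking the id modulo the number of servers.
    return embedding_id % ps_num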
Example 9
    def test_restart_ps(self):
        num_data = 8
        training_data = [
            random_batch(self._batch_size) for _ in range(num_data)
        ]
        workers = []
        for w in range(2):
            self._restart_pserver()
            tf.keras.backend.clear_session()
            tf.random.set_seed(22)
            arguments = [
                "--worker_id",
                0,
                "--job_type",
                elasticdl_pb2.TRAINING,
                "--minibatch_size",
                self._batch_size,
                "--model_zoo",
                self._model_zoo_path,
                "--model_def",
                self._model_def,
                "--distribution_strategy",
                "ParameterServerStrategy",
            ]
            args = parse_worker_args(arguments)

            worker = Worker(args, ps_channels=self._channel)
            workers.append(worker)
            worker._run_model_call_before_training(training_data[0][0])
            for i in range(num_data):
                worker.get_model(0, elasticdl_pb2.MINIMUM)
                w_loss, w_grads = worker.training_process_eagerly(
                    training_data[i][0], training_data[i][1]
                )
                worker.report_gradient(w_grads)
                if w == 1 and i == 3:
                    # Restart ps for the 2nd worker at i==3
                    self._restart_pserver()
                    # `report_variable` will be called in `get_model` to
                    # initialize variables on ps with worker variables
                    worker.get_model(0, elasticdl_pb2.MINIMUM)
                    # send the grads again as these grads are not applied
                    # on worker variables
                    worker.report_gradient(w_grads)

        for var_name in workers[0]._non_embed_vars:
            np.testing.assert_array_equal(
                workers[0]._non_embed_vars[var_name].numpy(),
                workers[1]._non_embed_vars[var_name].numpy(),
            )
Example 10
def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)
    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")

    master_channel = build_channel(args.master_addr)

    ps_channels = []
    if args.ps_addrs:
        ps_addrs = args.ps_addrs.split(",")

        for addr in ps_addrs:
            # addr is in the form "ps-pod-name.namespace.svc:port"
            channel = build_channel(addr)

            succeeded = False
            for i in range(CONNECT_PS_MAX_RETRIES):
                try:
                    grpc.channel_ready_future(channel).result(
                        timeout=CONNECT_PS_TIMEOUT)
                    logger.info("grpc channel %s to connect pod %s is ready" %
                                (addr, addr.split(".")[0]))
                    ps_channels.append(channel)
                    succeeded = True
                    break
                except grpc.FutureTimeoutError:
                    logger.warning("Failed to connect to pod %s on retry %d" %
                                   (addr.split(".")[0], i))
            if not succeeded:
                raise TimeoutError(
                    "Timed out connecting to pod %s after %d retries" %
                    (addr.split(".")[0], CONNECT_PS_MAX_RETRIES))

    if args.distribution_strategy == DistributionStrategy.ALLREDUCE:
        logger.info("Wait for %s seconds for FTLib consensus service to "
                    "detect the worker pod" %
                    str(_ALLREDUCE_STRATEGY_WARM_UP_SECS))
        time.sleep(_ALLREDUCE_STRATEGY_WARM_UP_SECS)

    worker = Worker(
        args,
        channel=master_channel,
        ps_channels=ps_channels,
        set_parallelism=True,
    )
    worker.run()
Example 11
 def test_embedding_layer(self):
     arguments = [
         "--worker_id",
         1,
         "--job_type",
         JobType.TRAINING_ONLY,
         "--minibatch_size",
         32,
         "--model_zoo",
         _model_zoo_path,
         "--model_def",
         "embedding_test_module.EdlEmbeddingModel",
     ]
     args = parse_worker_args(arguments)
     worker = Worker(args)
     self.assertTrue(len(worker._embedding_layers) == 2)
Example 12
def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    logger.info("Starting worker %d", args.worker_id)
    if args.master_addr is None:
        raise ValueError("master_addr is missing for worker")

    master_client = MasterClient(build_channel(args.master_addr),
                                 args.worker_id)

    ps_client = None
    if (args.distribution_strategy == DistributionStrategy.PARAMETER_SERVER
            and args.ps_addrs):
        ps_channels = []
        ps_addrs = args.ps_addrs.split(",")

        for addr in ps_addrs:
            # addr is in the form "ps-pod-name.namespace.svc:port"
            channel = build_channel(addr)

            succeeded = False
            for i in range(CONNECT_PS_MAX_RETRIES):
                try:
                    grpc.channel_ready_future(channel).result(
                        timeout=CONNECT_PS_TIMEOUT)
                    logger.info("grpc channel %s to connect pod %s is ready" %
                                (addr, addr.split(".")[0]))
                    ps_channels.append(channel)
                    succeeded = True
                    break
                except grpc.FutureTimeoutError:
                    logger.warning("Failed to connect to pod %s on retry %d" %
                                   (addr.split(".")[0], i))
            if not succeeded:
                raise TimeoutError(
                    "Timed out connecting to pod %s after %d retries" %
                    (addr.split(".")[0], CONNECT_PS_MAX_RETRIES))
        ps_client = PSClient(ps_channels)

    worker = Worker(
        args,
        master_client=master_client,
        ps_client=ps_client,
        set_parallelism=True,
    )
    worker.run()
Example 13
 def _create_worker(self, worker_num):
     for i in range(worker_num):
         tf.keras.backend.clear_session()
         tf.random.set_seed(22)
         arguments = [
             "--job_type",
             elasticai_api_pb2.TRAINING,
             "--minibatch_size",
             self._batch_size,
             "--model_zoo",
             self._model_zoo_path,
             "--model_def",
             self._model_def,
             "--distribution_strategy",
             DistributionStrategy.PARAMETER_SERVER,
         ]
         args = parse_worker_args(arguments)
         worker = Worker(args, ps_client=PSClient(self._channels))
         self._workers.append(worker)
Example 14
 def _create_worker(self, worker_num, max_allreduce_retry_num=0):
     for i in range(worker_num):
         arguments = [
             "--worker_id",
             i,
             "--job_type",
             elasticdl_pb2.TRAINING,
             "--minibatch_size",
             self._batch_size,
             "--model_zoo",
             self._model_zoo_path,
             "--model_def",
             self._model_def,
             "--distribution_strategy",
             DistributionStrategy.ALLREDUCE,
         ]
         args = parse_worker_args(arguments)
         worker = Worker(args,
                         max_allreduce_retry_num=max_allreduce_retry_num)
         self._workers.append(worker)
Example 15
def main():
    args = parse_worker_args()
    logger = log_utils.get_logger(__name__)
    master_addr = args.master_addr
    worker_id = int(args.worker_id)

    logger.info("Starting worker %d", worker_id)

    master_client = MasterClient(build_channel(master_addr), worker_id)

    logger.info("Building PS connection...")
    ps_client = (build_ps_client(args.ps_addrs, logger)
                 if args.distribution_strategy
                 == DistributionStrategy.PARAMETER_SERVER else None)

    logger.info("PS connection built.")

    worker = Worker(
        args,
        master_client=master_client,
        ps_client=ps_client,
        set_parallelism=True,
    )
    worker.run()
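
Note: build_ps_client is not shown in this excerpt. Judging from the inline
retry loop in Example 12, a helper of roughly this shape is plausible (a
sketch under that assumption; build_channel, PSClient, and the retry
constants are the names used in the surrounding examples):

import grpc

def build_ps_client(ps_addrs, logger):
    # Sketch based on Example 12: connect to every parameter server address
    # with bounded retries and return a PSClient over the ready channels.
    ps_channels = []
    for addr in ps_addrs.split(","):
        channel = build_channel(addr)
        for i in range(CONNECT_PS_MAX_RETRIES):
            try:
                grpc.channel_ready_future(channel).result(
                    timeout=CONNECT_PS_TIMEOUT)
                ps_channels.append(channel)
                break
            except grpc.FutureTimeoutError:
                logger.warning(
                    "Failed to connect to pod %s on retry %d",
                    addr.split(".")[0], i)
        else:
            raise TimeoutError("Timed out connecting to pod %s" % addr)
    return PSClient(ps_channels)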
Example 16
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master. gRPC
    calls are mocked by local master calls.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A List of callbacks that will be called at given
            stages of the training procedure.
        use_async: A python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return master._version
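
A hypothetical call to the helper above, for illustration only (the model zoo
path and feature shape are placeholders; the model_def is borrowed from the
MNIST examples elsewhere on this page):

model_version = distributed_train_and_evaluate(
    feature_shape=(28, 28),
    model_zoo_path="/path/to/model_zoo",
    model_def="mnist_functional_api.mnist_functional_api.custom_model",
    training=True,
)
print("Model version after training:", model_version)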
Example 17
 def _create_worker(self, arguments):
     tf.keras.backend.clear_session()
     tf.random.set_seed(22)
     args = parse_worker_args(arguments)
     return Worker(args)
Example 18
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master. gRPC
    calls are mocked by local master calls.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.ALLREDUCE.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    evaluation_steps = 1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__

    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )

    def master_creator():
        return MasterServicer(
            batch_size,
            evaluation_service=evaluation_service,
            master=master,
        )

    svc, port = _server(master_creator)
    mc = MasterClient(build_channel("localhost:%d" % port), 1)
    worker = Worker(args, master_client=mc, ps_client=PSClient(ps_channels))

    for pservicer in pservers:
        # FIXME(yancey1989): decouple pserver and master client
        pservicer._master_stub = mc

    worker.run()

    task = mc.get_task()
    # stop the master servicer
    svc.stop(0)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return task.model_version
Example 19
    def testMaxCheckpointVersions(self):
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testMaxCheckpointVersions")
            os.makedirs(chkp_dir)
            # Save checkpoints every 2 steps, and keep 5 checkpoints at most
            checkpointer = CheckpointService(chkp_dir, 2, 5, False)
            self.assertTrue(checkpointer.is_enabled())

            batch_size = 2
            # Launch the training
            arguments = [
                "--worker_id",
                1,
                "--job_type",
                JobType.TRAINING_ONLY,
                "--minibatch_size",
                batch_size,
                "--model_zoo",
                _model_zoo_path,
                "--model_def",
                "test_module.custom_model",
            ]
            args = parse_worker_args(arguments)
            worker = Worker(args)

            filename = create_recordio_file(128, DatasetName.TEST_MODULE, 1)
            task_d = _TaskDispatcher({filename: (0, 128)}, {}, {},
                                     records_per_task=64,
                                     num_epochs=1)
            master = MasterServicer(
                2,
                batch_size,
                worker._opt_fn(),
                task_d,
                init_var=worker._model.trainable_variables,
                checkpoint_filename_for_init="",
                checkpoint_service=checkpointer,
                evaluation_service=None,
            )

            worker._stub = InProcessMaster(master)
            worker.run()

            # We should have 5 checkpoints when the training finishes
            checkpoint_files = sorted(os.listdir(checkpointer._directory))
            self.assertEqual(
                checkpoint_files,
                [
                    "model_v24.chkpt",
                    "model_v26.chkpt",
                    "model_v28.chkpt",
                    "model_v30.chkpt",
                    "model_v32.chkpt",
                ],
            )
            # Latest version should be 32
            self.assertEqual(32, checkpointer.get_latest_checkpoint_version())
            # Check all checkpoints
            for version in [24, 26, 28, 30, 32]:
                model = checkpointer.get_checkpoint_model(version)
                self.assertEqual(version, model.version)
            # Checkpoint not found
            self.assertRaisesRegex(
                RuntimeError,
                "Failed to read model checkpoint from file",
                checkpointer.get_checkpoint_model,
                100,
            )
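
Why versions 24 through 32 are the expected checkpoints above (a reading of
the arguments, assuming the master bumps the model version once per
grads_to_wait gradient reports): 128 records at minibatch_size 2 yield 64
gradient reports from the single worker; with grads_to_wait=2 the final model
version is 32. Checkpoints are saved every 2 versions and only the latest 5
are kept, leaving versions 24, 26, 28, 30, and 32.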
Example 20
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master. gRPC
    calls are mocked by local master calls.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A List of callbacks that will be called at given
            stages of the training procedure.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.ALLREDUCE.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    evaluation_steps = 1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__

    for channel in ps_channels:
        grpc.channel_ready_future(channel).result()
    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)
    worker = Worker(args, ps_channels=ps_channels)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = MasterServicer(
        batch_size,
        task_d,
        evaluation_service=evaluation_service,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]

    in_process_master = InProcessMaster(master, callbacks)
    worker._stub = in_process_master
    for pservicer in pservers:
        pservicer._master_stub = in_process_master

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.get_task(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return master._version
Example 21
    def _worker_train(self, train_db, test_db, dataset, stop_step):
        if dataset == "mnist":
            model_def = (
                "mnist_functional_api.mnist_functional_api.custom_model"
            )
        elif dataset == "frappe":
            model_def = (
                "deepfm_functional_api.deepfm_functional_api.custom_model"
            )
        else:
            raise ValueError("dataset %s is not supported" % dataset)
        arguments = [
            "--worker_id",
            0,
            "--job_type",
            elasticdl_pb2.TRAINING,
            "--minibatch_size",
            self._batch_size,
            "--model_zoo",
            self._model_zoo_path,
            "--model_def",
            model_def,
            "--distribution_strategy",
            "ParameterServerStrategy",
        ]
        args = parse_worker_args(arguments)

        worker = Worker(args, ps_channels=self._channel)
        acc_meter = tf.keras.metrics.Accuracy()
        worker_results = []
        for step, (x, y) in enumerate(train_db):
            if step == 0:
                worker._run_model_call_before_training(x)

            worker.get_model(step, elasticdl_pb2.MINIMUM)

            w_loss, w_grads = worker.training_process_eagerly(x, y)
            worker.report_gradient(w_grads)

            if step % 20 == 0:
                worker.get_model(step, elasticdl_pb2.MINIMUM)
                for (x, y) in test_db:
                    out = worker.forward_process(x)
                    if dataset == "mnist":
                        acc_meter.update_state(tf.argmax(out, axis=1), y)
                    else:
                        out["probs"] = tf.reshape(out["probs"], [-1])
                        acc_meter.update_state(
                            tf.where(
                                out["probs"] < 0.5,
                                x=tf.zeros_like(y),
                                y=tf.ones_like(y),
                            ),
                            y,
                        )
                worker_results.append(
                    (float(w_loss.numpy()), float(acc_meter.result().numpy()))
                )
                acc_meter.reset_states()

            if step > stop_step:
                break
        return worker_results
Example 22
    def test_compare_onebatch_train(self):
        model_def = "mnist_functional_api.mnist_functional_api.custom_model"
        self._create_pserver(model_def, 2)
        images, labels = get_random_batch(self._batch_size)
        # TODO(yunjian.lmh): test optimizer wrapper
        arguments = [
            "--worker_id",
            0,
            "--job_type",
            elasticdl_pb2.TRAINING,
            "--minibatch_size",
            self._batch_size,
            "--model_zoo",
            self._model_zoo_path,
            "--model_def",
            model_def,
            "--distribution_strategy",
            DistributionStrategy.PARAMETER_SERVER,
        ]
        args = parse_worker_args(arguments)

        tf.keras.backend.clear_session()
        tf.random.set_seed(22)

        worker = Worker(args, ps_channels=self._channels)
        worker._run_model_call_before_training(images)
        worker.get_model()
        w_loss, w_grads = worker.training_process_eagerly(images, labels)
        worker.report_gradient(w_grads)

        tf.keras.backend.clear_session()
        tf.random.set_seed(22)

        (
            model,
            dataset_fn,
            loss_fn,
            opt_fn,
            eval_metrics_fn,
            prediction_outputs_processor,
            create_data_reader_fn,
            callback_list,
        ) = get_model_spec(
            model_zoo=self._model_zoo_path,
            model_def=model_def,
            dataset_fn="dataset_fn",
            model_params=None,
            loss="loss",
            optimizer="optimizer",
            eval_metrics_fn="eval_metrics_fn",
            prediction_outputs_processor="PredictionOutputsProcessor",
            custom_data_reader="custom_data_reader",
            callbacks="callbacks",
        )

        with tf.GradientTape() as tape:
            output = model.call(images, training=True)
            labels = tf.reshape(labels, [-1])
            loss = loss_fn(labels, output)
        grads = tape.gradient(loss, model.trainable_variables)
        opt_fn().apply_gradients(zip(grads, model.trainable_variables))

        for v in model.trainable_variables:
            ps_id = string_to_id(v.name, len(self._channels))
            ps_v = self._pservers[ps_id].parameters.get_non_embedding_param(
                v.name)
            np.testing.assert_array_equal(ps_v.numpy(), v.numpy())
Example 23
    def test_train_acceleration_with_embedding(self):
        kv_store = MockKvStore()
        model_inst = CustomModel()
        master = MasterServicer(
            2,
            2,
            tf.optimizers.SGD(0.1),
            None,
            init_var=model_inst.trainable_variables,
            checkpoint_filename_for_init=None,
            checkpoint_service=None,
            evaluation_service=None,
        )
        arguments = [
            "--worker_id",
            1,
            "--job_type",
            JobType.TRAINING_ONLY,
            "--minibatch_size",
            32,
            "--model_zoo",
            _model_zoo_path,
            "--model_def",
            "embedding_test_module.EdlEmbeddingModel",
        ]
        args = parse_worker_args(arguments)
        worker = Worker(args)
        worker._stub = InProcessMaster(master)

        inputs_list = [
            {
                "f1": tf.constant([[0], [1], [2]], tf.int64),
                "f2": tf.constant([[2], [1], [0]], tf.int64),
            },
            {
                "f1": tf.constant([[3], [4], [3]], tf.int64),
                "f2": tf.constant([[2], [1], [0]], tf.int64),
            },
        ]
        labels_list = [[0, 1, 0], [1, 1, 0]]
        input_dim = 5
        embedding_dim = 16
        worker.set_model(model_inst)

        # initialize kv store
        for layer in model_inst.layers:
            if isinstance(layer, Embedding):
                name = layer.name
                keys = [Embedding.get_key([name, i]) for i in range(input_dim)]
                values = [
                    np.random.rand(embedding_dim).astype(np.float32)
                    for i in range(input_dim)
                ]
                kv_store.update(keys, values)

        with mock.patch.object(
            EmbeddingService, "lookup_embedding", kv_store.lookup
        ), mock.patch.object(
            EmbeddingService, "update_embedding", kv_store.update
        ):
            worker._init_embedding_layer()
            worker._run_model_call_before_training(inputs_list[0])

            # run training process without tf.function
            correct_grads = []
            correct_ids_list = []
            for features, labels in zip(inputs_list, labels_list):
                loss, grads = worker.training_process_eagerly(features, labels)
                correct_grads.append(grads)
                ids = {}
                for layer in worker._embedding_layers:
                    ids[layer.name] = layer.embedding_and_ids[0].batch_ids
                correct_ids_list.append(ids)
                worker._reset_embedding()

            # run training process with tf.function
            test_grads = []
            test_ids_list = []
            for features, labels in zip(inputs_list, labels_list):
                self.assertFalse(worker._train_eagerly)
                loss, grads = worker.training_process(features, labels)
                test_grads.append(grads)
                ids = {}
                for layer in worker._embedding_layers:
                    ids[layer.name] = copy.deepcopy(
                        layer.embedding_and_ids[0].batch_ids
                    )
                test_ids_list.append(ids)
                worker._reset_embedding()

        # compare the gradients
        for test_g, correct_g in zip(test_grads, correct_grads):
            for g1, g2 in zip(test_g, correct_g):
                if isinstance(g1, tf.IndexedSlices):
                    self.assertTrue(np.isclose(g1.values, g2.values).all())
                    self.assertTrue(np.isclose(g1.indices, g2.indices).all())
                else:
                    self.assertTrue(np.isclose(g1, g2).all())

        for test_ids, correct_ids in zip(correct_ids_list, test_ids_list):
            for layer_name in correct_ids.keys():
                self.assertTrue(
                    tf.equal(test_ids[layer_name], correct_ids[layer_name])
                    .numpy()
                    .all()
                )