Example #1
0
 def _init_embedding_layer(self):
     """
     Init elasticdl.layers.embedding layer list and assign worker to them
     """
     self._embedding_layers = find_layer(self._model, Embedding)
     for layer in self._embedding_layers:
         layer.set_endpoint(self._embedding_service_endpoint)
    def _test_correctness(self, optimizer_class, X, Y, seed, **kwargs):
        """Test the correctness of specific TensorFlow optimizer."""
        _model_file = get_module_file_path(
            os.path.dirname(os.path.realpath(__file__)),
            "embedding_test_module.KerasEmbeddingModel",
        )
        model_module = load_module(_model_file).__dict__

        # train model with TensorFlow optimizer
        weights = self._random_init_model_weight(
            [(4, 4), (4, 4), (72, 1), (1,)], seed
        )
        loss_fn = model_module["loss"]
        model1 = model_module["KerasEmbeddingModel"](4, 4, weights)
        opt1 = optimizer_class(**kwargs)
        _train(model1, opt1, X, Y, loss_fn, random_seed=seed)

        model2 = model_module["EdlEmbeddingModel"](4, weights[2:])
        opt2 = optimizer_class(**kwargs)

        layer_names = [layer.name for layer in find_layer(model2, Embedding)]
        embed_dims = dict([(layer_name, 4) for layer_name in layer_names])

        # intialize embedding vectors in kv store
        mock_kv_store = MockKvStore({})
        for layer, embed_table in zip(layer_names, weights[:2]):
            for i, embed_vector in enumerate(embed_table):
                mock_kv_store.update(["%s-%d" % (layer, i)], [embed_vector])

        # train model with optimizer wrapper
        with mock.patch.object(
            EmbeddingService, "lookup_embedding", mock_kv_store.lookup
        ), mock.patch.object(
            EmbeddingService, "update_embedding", mock_kv_store.update
        ):
            _train_edl_embedding_with_optimizer_wrapper(
                model2, opt2, X, Y, loss_fn, embed_dims, random_seed=seed
            )

        # compare trained parameters
        wrong_msg = (
            "The updated parameters of Optimizer Wrapper and TensorFlow "
            "optimizer %s differ." % opt1.get_config()["name"]
        )

        for layer1, layer2 in zip(model1.layers, model2.layers):
            if "embedding" in layer2.name:
                w1 = layer1.weights[0].numpy()
                keys = [Embedding.get_key([layer2.name, i]) for i in range(4)]
                w2 = np.concatenate(mock_kv_store.lookup(keys)[0]).reshape(
                    4, -1
                )
                self.assertTrue((w1 - w2 < 0.0001).all(), msg=wrong_msg)
            else:
                for w1, w2 in zip(layer1.weights, layer2.weights):
                    self.assertTrue(
                        (w1 - w2 < 0.0001).numpy().all(), msg=wrong_msg
                    )
Example #3
0
def _train_edl_embedding_with_optimizer_wrapper(model, opt_keras, X, Y,
                                                loss_fn, embed_dims,
                                                random_seed):
    """Train model with optimizer wrapper."""
    tf.random.set_seed(random_seed)
    optimizer = OptimizerWrapper(opt_keras, None, embed_dims)

    # initialization process related to embedding layer and optimizer wrapper
    embed_layers = find_layer(model, Embedding)

    # training process
    for train_iter, (features, labels) in enumerate(zip(X, Y)):
        with tf.GradientTape() as tape:
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # Need to get non-embedding variables inside for loop because model
        # creates variables after the first time `model.call` is called
        if not train_iter:
            non_embed_vars = get_non_embedding_trainable_vars(
                model, embed_layers)
        embed_items = []
        for layer in embed_layers:
            embed_items.extend([(bet, layer.name, ids)
                                for bet, ids in layer.bet_ids_pair])

        grads = tape.gradient(
            loss, non_embed_vars + [var for var, _, _ in embed_items])

        # TODO: do not need to merge gradient from the same embedding layer
        # after `optimizer_wrapper` support grads_and_vars with duplicated
        # layer name
        non_embed_vars_n = len(non_embed_vars)
        non_embed_grads = grads[:non_embed_vars_n]
        embed_grads_dict = {}
        for (_, layer_name, ids), grad in zip(embed_items,
                                              grads[non_embed_vars_n:]):
            if layer_name in embed_grads_dict:
                merged_grads = embed_grads_dict[layer_name]
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([merged_grads.values, grad.values], axis=0),
                    tf.concat([merged_grads.indices, ids], axis=0),
                )
            else:
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids)

        optimizer.apply_gradients(
            list(zip(non_embed_grads, non_embed_vars)) +
            [(grad, layer_name)
             for layer_name, grad in embed_grads_dict.items()])

        for layer in embed_layers:
            layer.reset()
Example #4
0
 def _init_embedding_layer(self):
     """
     Init elasticdl.layers.embedding layer list and assign worker to them
     """
     self._embedding_layers = find_layer(self._model, Embedding)
     for layer in self._embedding_layers:
         layer.set_lookup_func(self.lookup_embedding)
     if self._embedding_layers:
         # TODO check that Redis IP/PORT is set
         pass
Example #5
0
    def test_find_layer_nested(self):
        model_def = "resnet50_subclass.resnet50_subclass.CustomModel"
        model = _create_model_instance(model_def)

        layer_num = {
            tf.keras.layers.Conv2D: 53,
            tf.keras.layers.Activation: 50,
            tf.keras.layers.Embedding: 0,
        }

        for layer_class in layer_num:
            layers = find_layer(model, layer_class)
            self.assertEqual(layer_num[layer_class], len(layers))
Example #6
0
    def test_find_layer(self):
        model_def = "mnist_functional_api.mnist_functional_api.custom_model"
        model = _create_model_instance(model_def)

        layer_num = {
            tf.keras.layers.Conv2D: 2,
            tf.keras.layers.Dropout: 1,
            tf.keras.layers.Embedding: 0,
        }

        for layer_class in layer_num:
            layers = find_layer(model, layer_class)
            self.assertEqual(layer_num[layer_class], len(layers))
Example #7
0
def main():
    args = parse_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        args.records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)).__dict__
    model_inst = load_model_from_module(args.model_def, model_module,
                                        args.model_params)
    optimizer = model_module[args.optimizer]()

    if all((
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
    )):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all((
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
    )):
        job_type = JobType.EVALUATION_ONLY
    elif all((
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
    )):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (job_type == JobType.TRAINING_WITH_EVALUATION
            or job_type == JobType.EVALUATION_ONLY):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            " and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info("Embedding service start succeeded. The endpoint is %s." %
                    str(embedding_service_endpoint))
        embedding_dims = dict([(layer.name, layer.output_dim)
                               for layer in layers])

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--model_zoo",
            args.model_zoo,
            "--master_addr",
            master_addr,
            "--log_level",
            args.log_level,
            "--dataset_fn",
            args.dataset_fn,
            "--loss",
            args.loss,
            "--optimizer",
            args.optimizer,
            "--eval_metrics_fn",
            args.eval_metrics_fn,
            "--model_def",
            args.model_def,
            "--job_type",
            job_type,
            "--minibatch_size",
            str(args.minibatch_size),
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
            "--get_model_steps",
            str(args.get_model_steps),
        ]

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running...")
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning("Unable to keep TensorBoard running. "
                               "It has already terminated")
                break
    logger.info("Master stopped")
def _train_edl_embedding_with_optimizer_wrapper(
    model, opt_keras, X, Y, loss_fn, embed_dims, random_seed
):
    """Train model with optimizer wrapper."""
    tf.random.set_seed(random_seed)
    optimizer = OptimizerWrapper(opt_keras, None, embed_dims)

    # initialization process related to embedding layer and optimizer wrapper
    embed_layers = find_layer(model, Embedding)

    def lookup_func(ids, layer_name, initializer, output_dim):
        values, unknown = EmbeddingService.lookup_embedding(
            [Embedding.get_key([layer_name, i]) for i in ids]
        )
        return np.concatenate(values).reshape(len(ids), -1)

    for layer in embed_layers:
        layer.set_lookup_func(lookup_func)

    # training process
    for features, labels in zip(X, Y):
        with tf.GradientTape() as tape:
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # TODO: calculate train_vars_embed and train_vars_other can be a
        # reusable function
        train_vars_embed = []
        train_vars_other = []
        for layer in model.layers:
            if isinstance(layer, Embedding):
                for bet, ids in layer.bet_ids_pair:
                    train_vars_embed.append((bet, layer.name, ids))
            else:
                vars = layer.trainable_variables
                train_vars_other.extend(vars)

        grads = tape.gradient(
            loss, train_vars_other + [var for var, _, _ in train_vars_embed]
        )

        # TODO: do not need to merge gradient from the same embedding layer
        # after `optimizer_wrapper` support grads_and_vars with duplicated
        # layer name
        train_vars_other_len = len(train_vars_other)
        grads_new = grads[:train_vars_other_len]
        grads_embed_dict = {}
        for (_, layer_name, ids), grad in zip(
            train_vars_embed, grads[train_vars_other_len:]
        ):
            if layer_name in grads_embed_dict:
                grads_merged = grads_embed_dict[layer_name]
                grads_embed_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([grads_merged.values, grad.values], axis=0),
                    tf.concat([grads_merged.indices, ids], axis=0),
                )
            else:
                grads_embed_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids
                )

        optimizer.apply_gradients(
            list(zip(grads_new, train_vars_other))
            + [
                (grad, layer_name)
                for layer_name, grad in grads_embed_dict.items()
            ]
        )

        for layer in embed_layers:
            layer.reset()