def testEvaluationOnly(self):
    task_d = _TaskDispatcher({}, {"f1": (0, 10), "f2": (0, 10)}, {}, 3, 1)
    evaluation_service = EvaluationService(None, None, task_d, 0, 0, 0, True)
    task_d.set_evaluation_service(evaluation_service)
    master = MasterServicer(
        2,
        2,
        None,
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=None,
        evaluation_service=evaluation_service,
    )
    master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

    self.assertEqual(8, len(task_d._todo))
    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()
    self.assertTrue(evaluation_service._eval_job.finished())

def testEvaluationOnly(self):
    task_d = create_task_manager([], [("f1", 0, 10), ("f2", 0, 10)])
    task_d.create_tasks(elasticai_api_pb2.EVALUATION)
    evaluation_service = EvaluationService(
        task_d.create_evaluation_tasks, 0, True, _eval_metrics_fn
    )
    task_d.set_evaluation_service(evaluation_service)
    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )
    _ = MasterServicer(
        master.task_d,
        master.instance_manager,
        None,
        evaluation_service,
    )

    self.assertEqual(8, len(task_d._eval_todo))
    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()
    self.assertTrue(evaluation_service._eval_job.finished())

def testEvaluationService(self):
    task_d = _TaskDispatcher(
        {"f1": (0, 10), "f2": (0, 10)},
        {"f1": (0, 10), "f2": (0, 10)},
        {},
        3,
        1,
    )
    # Evaluation metrics will not be accepted if no evaluation ongoing
    evaluation_service = EvaluationService(
        None, task_d, 10, 20, 0, False, _eval_metrics_fn
    )
    _ = MasterServicer(
        2, task_d, evaluation_service=evaluation_service, master=None
    )

    # No checkpoint available
    self.assertFalse(evaluation_service.try_to_create_new_job())

    # Add an evaluation task and we can start evaluation
    self.assertEqual(8, len(task_d._todo))
    evaluation_service.add_evaluation_task(False)
    self.assertEqual(8, len(task_d._eval_todo))
    self.assertFalse(evaluation_service._eval_job.finished())

    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()
    self.assertTrue(evaluation_service._eval_job is None)
    self.assertFalse(evaluation_service.try_to_create_new_job())

def testEvaluationService(self):
    task_d = create_task_manager(
        [("f1", 0, 10), ("f2", 0, 10)], [("f1", 0, 10), ("f2", 0, 10)]
    )
    # Evaluation metrics will not be accepted if no evaluation ongoing
    evaluation_service = EvaluationService(
        task_d.create_evaluation_tasks, 0, False, _eval_metrics_fn
    )
    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )
    _ = MasterServicer(
        master.task_d, master.instance_manager, None, evaluation_service
    )

    # No checkpoint available
    self.assertFalse(evaluation_service.try_to_create_new_job())

    # Add an evaluation task and we can start evaluation
    self.assertEqual(8, len(task_d._todo))
    evaluation_service.add_evaluation_task(False)
    self.assertEqual(8, len(task_d._eval_todo))
    self.assertFalse(evaluation_service._eval_job.finished())

    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()
    self.assertTrue(evaluation_service._eval_job is None)
    self.assertFalse(evaluation_service.try_to_create_new_job())

def testEvaluationService(self):
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testEvaluationService")
        checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
        task_d = _TaskDispatcher(
            {"f1": (0, 10), "f2": (0, 10)},
            {"f1": (0, 10), "f2": (0, 10)},
            {},
            3,
            1,
        )

        # Evaluation metrics will not be accepted if no evaluation ongoing
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 10, 20, 0, False
        )
        evaluation_metrics = {
            "mse": ndarray_to_tensor(
                np.array([100, 200], dtype=np.float32)
            )
        }
        self.assertFalse(
            evaluation_service.report_evaluation_metrics(
                1, evaluation_metrics
            )
        )

        # No checkpoint available
        self.assertFalse(evaluation_service.try_to_create_new_job())

        master = MasterServicer(
            2,
            2,
            None,
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=evaluation_service,
        )
        master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

        # Add an evaluation task and we can start evaluation
        self.assertEqual(8, len(task_d._todo))
        evaluation_service.add_evaluation_task(0)
        self.assertEqual(16, len(task_d._todo))
        self.assertFalse(evaluation_service._eval_job.finished())

        for i in range(8):
            self.assertFalse(evaluation_service._eval_job.finished())
            evaluation_service.complete_task()
        self.assertTrue(evaluation_service._eval_job is None)
        self.assertFalse(evaluation_service.try_to_create_new_job())

def testEvaluationOnly(self):
    task_d = _TaskDispatcher({}, {"f1": (0, 10), "f2": (0, 10)}, {}, 3, 1)
    evaluation_service = EvaluationService(
        None, task_d, 0, 0, 0, True, _eval_metrics_fn
    )
    task_d.set_evaluation_service(evaluation_service)
    _ = MasterServicer(
        2, task_d, evaluation_service=evaluation_service, master=None
    )

    self.assertEqual(8, len(task_d._eval_todo))
    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()
    self.assertTrue(evaluation_service._eval_job.finished())

def _create_evaluation_service(self, eval_func, evaluation_steps):
    evaluation_service = None
    if (
        self.job_type == JobType.TRAINING_WITH_EVALUATION
        or self.job_type == JobType.EVALUATION_ONLY
    ):
        self.logger.info(
            "Creating evaluation service with evaluation steps %d",
            evaluation_steps,
        )
        evaluation_service = EvaluationService(
            self.task_manager.create_evaluation_tasks,
            evaluation_steps,
            self.job_type == JobType.EVALUATION_ONLY,
            eval_func,
        )
        self.task_manager.set_evaluation_service(evaluation_service)
    return evaluation_service

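# A minimal usage sketch for the helper above (hypothetical caller, not a
# verbatim excerpt): the surrounding master object is assumed to expose
# `job_type`, `task_manager`, and a loaded `model_module`, and `args` carries
# the `eval_metrics_fn` / `evaluation_steps` options used elsewhere in these
# snippets.
#
#     eval_func = self.model_module[args.eval_metrics_fn]
#     self.evaluation_service = self._create_evaluation_service(
#         eval_func, args.evaluation_steps
#     )
#     # Returns None for pure training/prediction jobs; otherwise the service
#     # is already registered with the task manager.
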
def _create_evaluation_service(self, args):
    evaluation_service = None
    if (
        self.job_type == JobType.TRAINING_WITH_EVALUATION
        or self.job_type == JobType.EVALUATION_ONLY
    ):
        self.logger.info(
            "Creating evaluation service with evaluation steps %d",
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            self.tb_service,
            self.task_d,
            args.evaluation_steps,
            self.job_type == JobType.EVALUATION_ONLY,
            self.model_module[args.eval_metrics_fn],
        )
        self.task_d.set_evaluation_service(evaluation_service)
    return evaluation_service

def _create_evaluation_service(self, args):
    evaluation_service = None
    if (
        self.job_type == JobType.TRAINING_WITH_EVALUATION
        or self.job_type == JobType.EVALUATION_ONLY
    ):
        self.logger.info(
            "Creating evaluation service with throttle seconds %d "
            "and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            self.checkpoint_service,
            self.tb_service,
            self.task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            self.job_type == JobType.EVALUATION_ONLY,
            self.model_module[args.eval_metrics_fn],
        )
        self.task_d.set_evaluation_service(evaluation_service)
    return evaluation_service

def main():
    args = parse_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        args.records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    model_inst = load_model_from_module(
        args.model_def, model_module, args.model_params
    )
    optimizer = model_module[args.optimizer]()

    if all(
        (
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
        )
    ):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all(
        (
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
        )
    ):
        job_type = JobType.EVALUATION_ONLY
    elif all(
        (
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
        )
    ):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (
        job_type == JobType.TRAINING_WITH_EVALUATION
        or job_type == JobType.EVALUATION_ONLY
    ):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            "and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info(
            "Embedding service start succeeded. The endpoint is %s."
            % str(embedding_service_endpoint)
        )
        embedding_dims = dict(
            [(layer.name, layer.output_dim) for layer in layers]
        )

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--model_zoo",
            args.model_zoo,
            "--master_addr",
            master_addr,
            "--log_level",
            args.log_level,
            "--dataset_fn",
            args.dataset_fn,
            "--loss",
            args.loss,
            "--optimizer",
            args.optimizer,
            "--eval_metrics_fn",
            args.eval_metrics_fn,
            "--model_def",
            args.model_def,
            "--job_type",
            job_type,
            "--minibatch_size",
            str(args.minibatch_size),
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
            "--get_model_steps",
            str(args.get_model_steps),
        ]

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running..."
        )
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning(
                    "Unable to keep TensorBoard running. "
                    "It has already terminated"
                )
                break
    logger.info("Master stopped")

def testNeedEvaluation(self):
    task_d = _TaskDispatcher(
        {"f1": (0, 10), "f2": (0, 10)},
        {"f1": (0, 10), "f2": (0, 10)},
        {},
        3,
        1,
    )
    evaluation_service = EvaluationService(
        None, task_d, 10, 0, 10, False, _eval_metrics_fn
    )

    # Should add evaluation task and create eval job
    evaluation_service.add_evaluation_task_if_needed(
        master_locking=False, model_version=10
    )
    self.assertTrue(evaluation_service._eval_job is not None)
    self.assertEqual(evaluation_service._eval_checkpoint_versions, [])

    # Should ignore because version 10 is in the eval list
    evaluation_service.add_evaluation_task_if_needed(
        master_locking=False, model_version=10
    )
    self.assertEqual(evaluation_service._eval_checkpoint_versions, [])

    # Should append version 20 to the eval list
    evaluation_service.add_evaluation_task_if_needed(
        master_locking=False, model_version=20
    )
    self.assertEqual(evaluation_service._eval_checkpoint_versions, [20])

    # Should ignore version 10 because version 20 is already in eval list
    evaluation_service.add_evaluation_task_if_needed(
        master_locking=False, model_version=10
    )
    self.assertEqual(evaluation_service._eval_checkpoint_versions, [20])

    # Should append version 30 to the eval list
    evaluation_service.add_evaluation_task_if_needed(
        master_locking=False, model_version=30
    )
    self.assertEqual(
        evaluation_service._eval_checkpoint_versions, [20, 30]
    )

def testNeedEvaluation(self):
    task_d = create_task_manager(
        [("f1", 0, 10), ("f2", 0, 10)], [("f1", 0, 10), ("f2", 0, 10)]
    )
    evaluation_service = EvaluationService(
        task_d.create_evaluation_tasks, 10, False, _eval_metrics_fn
    )

    # Should add evaluation task and create eval job
    evaluation_service.add_evaluation_task_if_needed(model_version=10)
    self.assertTrue(evaluation_service._eval_job is not None)
    self.assertEqual(evaluation_service._eval_checkpoint_versions, [])

    # Should ignore because version 10 is in the eval list
    evaluation_service.add_evaluation_task_if_needed(model_version=10)
    self.assertEqual(evaluation_service._eval_checkpoint_versions, [])

    # Should append version 20 to the eval list
    evaluation_service.add_evaluation_task_if_needed(model_version=20)
    self.assertEqual(evaluation_service._eval_checkpoint_versions, [20])

    # Should ignore version 10 because version 20 is already in eval list
    evaluation_service.add_evaluation_task_if_needed(model_version=10)
    self.assertEqual(evaluation_service._eval_checkpoint_versions, [20])

    # Should append version 30 to the eval list
    evaluation_service.add_evaluation_task_if_needed(model_version=30)
    self.assertEqual(
        evaluation_service._eval_checkpoint_versions, [20, 30]
    )

def distributed_train_and_evaluate(
    self,
    feature_shape,
    model_def,
    model_params="",
    training=True,
    dataset="",
):
    """
    Run distributed training and evaluation with a local master.
    grpc calls are mocked by local master call.
    """
    job_type = (
        JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
    )
    batch_size = 16
    worker = Worker(
        1,
        job_type,
        batch_size,
        _model_zoo_path,
        model_def=model_def,
        model_params=model_params,
        channel=None,
    )
    if dataset == "imagenet":
        batch_size = 8
        shards = {create_imagenet_recordio_file(8, feature_shape): (0, 8)}
    elif dataset == "frappe":
        shards = {
            create_frappe_recordio_file(16, feature_shape, 5383): (0, 16)
        }
    else:
        shards = {create_recordio_file(128, feature_shape): (0, 128)}

    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    # Initialize checkpoint service
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 0, 0, 1, False
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 0, 0, 0, True
        )
    task_d.set_evaluation_service(evaluation_service)

    # The master service
    master = MasterServicer(
        2,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
    )
    worker._stub = InProcessMaster(master)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    self.assertTrue(not task.shard_name)

def distributed_train_and_evaluate(self, training=True):
    """
    Run distributed training and evaluation with a local master.
    grpc calls are mocked by local master call.
    """

    class _Master(InProcessMaster):
        def ReportGradient(self, req):
            if 2 < self._m._version < 80:
                # For testing of retrain when gradient not accepted.
                # Increase master version to reject the gradient.
                self._m._version += 1
            return self._m.ReportGradient(req, None)

        def ReportEvaluationMetrics(self, req):
            if 2 < self._m._version < 80:
                # Testing of evaluation retries. Increase the master
                # version so the evaluation metrics will not be accepted.
                self._m._version += 1
            return self._m.ReportEvaluationMetrics(req, None)

    job_type = (
        JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
    )
    batch_size = 16
    worker = Worker(
        1,
        job_type,
        batch_size,
        _model_zoo_path,
        model_def="test_module.custom_model",
        channel=None,
    )

    shards = {create_recordio_file(128): 128}
    if training:
        training_shards = shards
        evaluation_shards = {}
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )
    if not training:
        evaluation_service = EvaluationService(
            None, None, task_d, 0, 0, 0, True
        )
        task_d.set_evaluation_service(evaluation_service)
    else:
        evaluation_service = None
    master = MasterServicer(
        2,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=None,
        evaluation_service=evaluation_service,
    )
    worker._stub = _Master(master)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    self.assertTrue(not task.shard_name)

def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.AllreduceStrategy.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    evaluation_steps = (
        1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__

    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )

    def master_creator():
        return MasterServicer(
            batch_size,
            evaluation_service=evaluation_service,
            master=master,
        )

    svc, port = _server(master_creator)
    mc = MasterClient(build_channel("localhost:%d" % port), 1)
    worker = Worker(args, master_client=mc, ps_client=PSClient(ps_channels))

    for pservicer in pservers:
        # FIXME(yancey1989): decouple pserver and master client
        pservicer._master_stub = mc

    worker.run()

    task = mc.get_task()
    # stop the master servicer
    svc.stop(0)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return task.model_version

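# Hypothetical invocation of the helper above from a test case. The model zoo
# path and model_def below are placeholders for illustration only, not real
# files from the repo; a real test would point them at an actual model module.
#
#     model_version = distributed_train_and_evaluate(
#         feature_shape=(10,),
#         model_zoo_path="/path/to/model_zoo",   # placeholder
#         model_def="my_module.custom_model",    # placeholder
#         training=True,
#         dataset_name=DatasetName.IMAGE_DEFAULT,
#     )
#     # model_version is the master's model version after the worker exits.
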
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A List of callbacks that will be called at given
            stages of the training procedure.
        use_async: A python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version

def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A List of callbacks that will be called at given
            stages of the training procedure.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.AllreduceStrategy.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    evaluation_steps = (
        1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__

    for channel in ps_channels:
        grpc.channel_ready_future(channel).result()
    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)
    worker = Worker(args, ps_channels=ps_channels)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = MasterServicer(
        batch_size,
        task_d,
        evaluation_service=evaluation_service,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    in_process_master = InProcessMaster(master, callbacks)
    worker._stub = in_process_master
    for pservicer in pservers:
        pservicer._master_stub = in_process_master

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.get_task(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version

def distributed_train_and_evaluate(
    self,
    training=True,
    callback_classes=[],
    use_async=False,
    grads_to_wait=2,
    get_model_steps=1,
):
    """
    Run distributed training and evaluation with a local master.
    grpc calls are mocked by local master call.
    """
    if use_async and grads_to_wait > 1:
        raise ValueError(
            "grads_to_wait should be 1 when using asynchronous SGD."
        )
    job_type = (
        JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
    )
    batch_size = 16
    worker = Worker(
        1,
        job_type,
        batch_size,
        _model_zoo_path,
        model_def="test_module.custom_model",
        channel=None,
        get_model_steps=get_model_steps,
    )

    shards = {create_recordio_file(128): (0, 128)}
    if training:
        training_shards = shards
        evaluation_shards = {}
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )
    if not training:
        evaluation_service = EvaluationService(
            None, None, task_d, 0, 0, 0, True
        )
        task_d.set_evaluation_service(evaluation_service)
    else:
        evaluation_service = None
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=None,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker, self)
        for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    self.assertTrue(not task.shard_name)
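
# Hypothetical usage within a unittest.TestCase that defines the helper above
# (a sketch, not a verbatim excerpt). The evaluation-only path and the
# asynchronous-SGD path, which requires grads_to_wait == 1, could be
# exercised as:
#
#     self.distributed_train_and_evaluate(training=False)
#     self.distributed_train_and_evaluate(
#         training=True, use_async=True, grads_to_wait=1
#     )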