def test_odps_data_reader_integration_with_local_keras(self):
    """End-to-end check that records read from the ODPS reader can drive a
    local Keras training loop defined in ``odps_test_module.py``.
    """
    num_records = 2
    # Load the user-model module that ships next to this test file; its
    # __dict__ exposes the model/optimizer/loss/dataset_fn entry points.
    model_spec = load_module(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "odps_test_module.py",
        )
    ).__dict__
    model = model_spec["custom_model"]()
    optimizer = model_spec["optimizer"]()
    loss = model_spec["loss"]
    dataset_fn = model_spec["dataset_fn"]

    def _gen():
        # Pull records for a single mocked shard [0, num_records) and
        # drop any None padding the reader may yield.
        for data in self.reader.read_records(
            _MockedTask(0, num_records, "shard_0")
        ):
            if data is not None:
                yield data

    dataset = tf.data.Dataset.from_generator(_gen, (tf.float32))
    dataset = dataset_fn(dataset, None)

    loss_history = []
    grads = None
    for features, labels in dataset:
        with tf.GradientTape() as tape:
            logits = model(features, training=True)
            loss_value = loss(logits, labels)
        loss_history.append(loss_value.numpy())
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # One loss entry per minibatch step; presumably the dataset_fn yields
    # one batch per record here — TODO confirm against odps_test_module.
    self.assertEqual(len(loss_history), num_records)
    # NOTE(review): len(grads) equals the number of trainable variables,
    # not the number of records; these pass only because the test model
    # happens to have exactly num_records (2) trainable variables — verify.
    self.assertEqual(len(grads), num_records)
    self.assertEqual(len(model.trainable_variables), num_records)
def _test_correctness(self, optimizer_class, X, Y, seed, **kwargs):
    """Test the correctness of specific TensorFlow optimizer.

    Trains the same data twice — once with a plain Keras embedding model
    and the given TF optimizer, once with the EdlEmbeddingModel whose
    embedding lookups/updates go through a mocked KV store — and asserts
    the resulting parameters agree.

    Args:
        optimizer_class: TF optimizer class under test.
        X, Y: training features and labels fed to ``_train``.
        seed: random seed used for both weight init and training.
        **kwargs: forwarded to ``optimizer_class`` for both instances.
    """
    _model_file = get_module_file_path(
        os.path.dirname(os.path.realpath(__file__)),
        "embedding_test_module.KerasEmbeddingModel",
    )
    model_module = load_module(_model_file).__dict__

    # train model with TensorFlow optimizer
    # Weight shapes: two 4x4 embedding tables, a (72, 1) dense kernel and
    # a (1,) bias — must match the test module's model definitions.
    weights = self._random_init_model_weight(
        [(4, 4), (4, 4), (72, 1), (1,)], seed
    )
    loss_fn = model_module["loss"]
    model1 = model_module["KerasEmbeddingModel"](4, 4, weights)
    opt1 = optimizer_class(**kwargs)
    _train(model1, opt1, X, Y, loss_fn, random_seed=seed)

    # EdlEmbeddingModel only takes the non-embedding weights; the two
    # embedding tables (weights[:2]) are seeded into the KV store below.
    model2 = model_module["EdlEmbeddingModel"](4, weights[2:])
    opt2 = optimizer_class(**kwargs)

    layer_names = [layer.name for layer in find_layer(model2, Embedding)]
    embed_dims = dict([(layer_name, 4) for layer_name in layer_names])

    # initialize embedding vectors in kv store
    mock_kv_store = MockKvStore({})
    for layer, embed_table in zip(layer_names, weights[:2]):
        for i, embed_vector in enumerate(embed_table):
            # Keys follow the "<layer_name>-<row_index>" convention.
            mock_kv_store.update(["%s-%d" % (layer, i)], [embed_vector])

    # train model with optimizer wrapper; EmbeddingService calls are
    # redirected to the in-memory mock KV store for both lookup and update.
    with mock.patch.object(
        EmbeddingService, "lookup_embedding", mock_kv_store.lookup
    ), mock.patch.object(
        EmbeddingService, "update_embedding", mock_kv_store.update
    ):
        _train_edl_embedding_with_optimizer_wrapper(
            model2, opt2, X, Y, loss_fn, embed_dims, random_seed=seed
        )

    # compare trained parameters
    wrong_msg = (
        "The updated parameters of Optimizer Wrapper and TensorFlow "
        "optimizer %s differ." % opt1.get_config()["name"]
    )

    for layer1, layer2 in zip(model1.layers, model2.layers):
        if "embedding" in layer2.name:
            # model1 keeps the table as a layer weight; model2's rows live
            # in the KV store and are re-assembled into a 4-row matrix.
            w1 = layer1.weights[0].numpy()
            keys = [Embedding.get_key([layer2.name, i]) for i in range(4)]
            w2 = np.concatenate(mock_kv_store.lookup(keys)[0]).reshape(
                4, -1
            )
            # NOTE(review): `w1 - w2 < 0.0001` is a one-sided comparison,
            # not abs(); it would pass if w1 were far BELOW w2 — confirm
            # whether np.abs(w1 - w2) was intended.
            self.assertTrue((w1 - w2 < 0.0001).all(), msg=wrong_msg)
        else:
            for w1, w2 in zip(layer1.weights, layer2.weights):
                # NOTE(review): same one-sided tolerance check as above.
                self.assertTrue(
                    (w1 - w2 < 0.0001).numpy().all(), msg=wrong_msg
                )
def main():
    """Entry point of the Spark job converting training data to RecordIO.

    Reads member file names out of the training-data tarball, then fans the
    per-file conversion work out over a Spark RDD using the user-defined
    ``prepare_data_for_a_single_file`` from ``--model_file``.
    """
    parser = argparse.ArgumentParser(
        description="Spark job to convert training data to RecordIO format"
    )
    parser.add_argument(
        "--training_data_tar_file",
        help="Tar file that contains all training data",
        required=True,
    )
    parser.add_argument(
        "--output_dir",
        help="Directory of output RecordIO data",
        required=True,
    )
    parser.add_argument(
        "--model_file",
        required=True,
        help="User-defined model file which data processing logic is in",
    )
    parser.add_argument(
        "--records_per_file", default=1024, type=int, help="Record per file"
    )
    parser.add_argument(
        "--num_workers",
        default=2,
        type=int,
        help="Number of workers of Spark job",
    )
    args = parser.parse_args()

    # Get training data file names from training_data_tar_file.
    # Fix: use a context manager so the archive is always closed (the
    # original leaked the handle), and test membership via TarInfo.isfile()
    # instead of extracting every member's content just to check for None.
    filename_list = []
    with tarfile.open(args.training_data_tar_file) as tar:
        for tar_info in tar.getmembers():
            # Skip directories/special members and hidden files.
            basename = tar_info.name.split("/")[-1]
            if tar_info.isfile() and not basename.startswith("."):
                filename_list.append(tar_info.name)

    # Load user-defined model
    model_module = load_module(args.model_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Start the Spark job: each partition converts its slice of files.
    sc = SparkContext()
    rdd = sc.parallelize(filename_list, args.num_workers)
    rdd.mapPartitions(
        process_data(
            model_module.prepare_data_for_a_single_file,
            args.training_data_tar_file,
            args.output_dir,
            args.records_per_file,
        )
    ).collect()
def __init__(
    self,
    *,
    image_name,
    namespace,
    job_name,
    event_callback,
    cluster_spec="",
):
    """
    ElasticDL k8s client.

    Args:
        image_name: Docker image path for ElasticDL pod.
        namespace: The name of the Kubernetes namespace where ElasticDL
            pods will be created.
        job_name: ElasticDL job name, should be unique in the namespace.
            Used as pod name prefix and value for "elasticdl" label.
        event_callback: If not None, an event watcher will be created and
            events passed to the callback.
        cluster_spec: Optional path/spec of a Python module exposing a
            ``cluster`` attribute; when non-empty it is loaded via
            ``load_module`` and stored on ``self.cluster``.
    """
    if os.getenv("KUBERNETES_SERVICE_HOST"):
        # We are running inside k8s
        config.load_incluster_config()
    else:
        # Use user's kube config
        config.load_kube_config()

    self.client = client.CoreV1Api()
    self.namespace = namespace
    self.job_name = job_name
    self._image_name = image_name
    self._event_cb = event_callback
    if self._event_cb:
        # Daemon thread so the watcher never blocks process shutdown.
        threading.Thread(
            target=self._watch, name="event_watcher", daemon=True
        ).start()
    self.cluster = None
    if cluster_spec:
        cluster_spec_module = load_module(cluster_spec)
        self.cluster = cluster_spec_module.cluster
def main():
    """Master entry point of an ElasticDL job.

    Orchestrates, in order: optional TensorBoard service, the task
    dispatcher, user-model loading, job-type inference, optional
    checkpoint/evaluation/embedding services, the gRPC master service,
    worker pod launch, and finally the wait-until-finished loop plus
    graceful shutdown.
    """
    args = parse_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        args.records_per_task,
        args.num_epochs,
    )
    # Load the user model module from the model zoo and instantiate the
    # model and the optimizer named by --optimizer.
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    model_inst = load_model_from_module(
        args.model_def, model_module, args.model_params
    )
    optimizer = model_module[args.optimizer]()

    # Infer the job type from which data directories were provided; the
    # fall-through default is training without evaluation.
    if all(
        (
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
        )
    ):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all(
        (
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
        )
    ):
        job_type = JobType.EVALUATION_ONLY
    elif all(
        (
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
        )
    ):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service.  Training-with-evaluation always gets
    # one because evaluation restores models from checkpoints.
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (
        job_type == JobType.TRAINING_WITH_EVALUATION
        or job_type == JobType.EVALUATION_ONLY
    ):
        # NOTE(review): %d formatting will fail if throttle_secs/steps is
        # None here — confirm parse_args guarantees ints for these flags.
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            " and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info(
            "Embedding service start succeeded. The endpoint is %s."
            % str(embedding_service_endpoint)
        )
        embedding_dims = dict(
            [(layer.name, layer.output_dim) for layer in layers]
        )

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        # Variables exist only after the model is built; otherwise the
        # servicer starts with an empty variable list.
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        # Build the command line each worker pod runs; it mirrors the
        # relevant master flags so workers stay configured consistently.
        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--model_zoo",
            args.model_zoo,
            "--master_addr",
            master_addr,
            "--log_level",
            args.log_level,
            "--dataset_fn",
            args.dataset_fn,
            "--loss",
            args.loss,
            "--optimizer",
            args.optimizer,
            "--eval_metrics_fn",
            args.eval_metrics_fn,
            "--model_def",
            args.model_def,
            "--job_type",
            job_type,
            "--minibatch_size",
            str(args.minibatch_size),
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
            "--get_model_steps",
            str(args.get_model_steps),
        ]

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    # Poll until all tasks are dispatched and done, saving the final
    # checkpoint if an output path was given; Ctrl-C skips straight to
    # shutdown.
    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running..."
        )
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning(
                    "Unable to keep TensorBoard running. "
                    "It has already terminated"
                )
                break
    logger.info("Master stopped")
def _create_model_instance(model_def):
    """Locate *model_def* in the model zoo, load its module, and build an
    instance of the model with no extra model parameters."""
    zoo_path = _get_model_zoo_path()
    namespace = load_module(get_module_file_path(zoo_path, model_def)).__dict__
    return load_model_from_module(model_def, namespace, None)
from elasticdl.proto import elasticdl_pb2
from elasticdl.python.common.constants import JobType
from elasticdl.python.common.model_helper import (
    get_module_file_path,
    load_module,
)
from elasticdl.python.master.checkpoint_service import CheckpointService
from elasticdl.python.master.servicer import MasterServicer
from elasticdl.python.master.task_dispatcher import _TaskDispatcher
from elasticdl.python.tests.in_process_master import InProcessMaster
from elasticdl.python.worker.worker import Worker

# Shared test fixtures: the model zoo is this test's own directory, and
# `m` is the namespace dict of the custom test model module.
_model_zoo_path = os.path.dirname(os.path.realpath(__file__))
_model_file = get_module_file_path(_model_zoo_path, "test_module.custom_model")
m = load_module(_model_file).__dict__


def create_recordio_file(size):
    """Write *size* random (x, y=2x+1) examples into a temporary RecordIO
    file for tests.

    NOTE(review): as visible here, the constructed `example` is never
    serialized into the writer `f` and the temp file name is never
    returned — this block looks truncated; confirm against the full file.
    """
    # delete=False so the file outlives this function for the test to read.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with closing(recordio.Writer(temp_file.name)) as f:
        for _ in range(size):
            x = np.random.rand(1).astype(np.float32)
            y = 2 * x + 1
            example_dict = {
                "x": tf.train.Feature(float_list=tf.train.FloatList(value=x)),
                "y": tf.train.Feature(float_list=tf.train.FloatList(value=y)),
            }
            example = tf.train.Example(
                features=tf.train.Features(feature=example_dict)
            )