def learn(iterations=100): current_best_val = load_model("../models/alphazerolike/azval.tf") current_best_pol = load_model("../models/alphazerolike/azpol.tf") candidate_val = clone_model(current_best_val) candidate_pol = clone_model(current_best_pol) _provide_new_training_data((candidate_val, candidate_pol)) for _ in tqdm(range(iterations)): candidate_val = clone_model(current_best_val) candidate_pol = clone_model(current_best_pol) _training(candidate_val, candidate_pol) if _compete((current_best_val, current_best_pol), (candidate_val, candidate_pol)): save_model(candidate_val, "../models/alphazerolike/azval.tf") save_model(candidate_pol, "../models/alphazerolike/azpol.tf") _provide_new_training_data((candidate_val, candidate_pol)) save_model( candidate_val, "../models/alphazerolike/history/" + datetime.datetime.now().strftime("%Y_%m_%d%H_%M")) save_model( candidate_pol, "../models/alphazerolike/history/" + datetime.datetime.now().strftime("%Y_%m_%d%H_%M")) current_best_pol = candidate_pol current_best_val = candidate_val
def _clone_and_build_model(model, strategy): # The new "original" model in worker 0. with strategy.scope(): cloned_model = models.clone_model(model) # Compile and build model. if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = model.optimizer # TODO(yuefengz): figure out why the optimizer here is still a # TFOptimizer. while isinstance(optimizer, optimizers.TFOptimizer): optimizer = optimizer.optimizer optimizer = copy.deepcopy(optimizer) else: optimizer_config = model.optimizer.get_config() optimizer = type(model.optimizer).from_config(optimizer_config) cloned_model.compile(optimizer, model.loss, metrics=metrics_module.clone_metrics( model._compile_metrics), loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=metrics_module.clone_metrics( model._compile_weighted_metrics)) return cloned_model
def _clone_and_build_model(model, inputs=None, targets=None): """Clone and build the given keras_model.""" # We need to set the import here since we run into a circular dependency # error. from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top cloned_model = models.clone_model(model, input_tensors=inputs) # Compile and build model. if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = model.optimizer else: optimizer_config = model.optimizer.get_config() optimizer = model.optimizer.__class__.from_config(optimizer_config) if isinstance(targets, tuple): targets = nest.flatten(targets) cloned_model.compile( optimizer, model.loss, metrics=metrics_module.clone_metrics(model.metrics), loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=metrics_module.clone_metrics(model.weighted_metrics), target_tensors=targets) return cloned_model
def _clone_and_build_model(model, inputs=None, targets=None): """Clone and build the given keras_model.""" # We need to set the import here since we run into a circular dependency # error. from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top cloned_model = models.clone_model(model, input_tensors=inputs) # Compile and build model. if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = model.optimizer else: optimizer_config = model.optimizer.get_config() optimizer = model.optimizer.__class__.from_config(optimizer_config) # TODO(priyag): Is there a cleaner way to do this? The API doc suggests a # single tensor should be OK but it throws an error in that case. if (targets is not None and not isinstance(targets, list) and not isinstance(targets, dict)): targets = [targets] cloned_model.compile( optimizer, model.loss, metrics=model.metrics, loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=model.weighted_metrics, target_tensors=targets) return cloned_model
def test_uncompiled_prebuilt_model_raises_error(self): """Tests that an uncompiled model cannot be used as build_fn param.""" for config in [ "MLPRegressor", "MLPClassifier", "CNNClassifier", "CNNClassifierF", ]: loader, model, build_fn, _ = CONFIG[config] data = loader() x_train, y_train = data.data[:100], data.target[:100] n_classes_ = np.unique(y_train).size # make y the same shape as will be used by .fit if config != "MLPRegressor": y_train = to_categorical(y_train) keras_model = build_fn( X=x_train, n_classes_=n_classes_, n_outputs_=1 ) else: keras_model = build_fn(X=x_train, n_outputs_=1) # clone to simulate uncompiled model keras_model = clone_model(keras_model) estimator = model(build_fn=keras_model) with pytest.raises(ValueError): check(estimator, loader)
def _clone_and_build_model(model, inputs=None, targets=None): """Clone and build the given keras_model.""" # We need to set the import here since we run into a circular dependency # error. from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top cloned_model = models.clone_model(model, input_tensors=inputs) # Compile and build model. if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = model.optimizer else: optimizer_config = model.optimizer.get_config() optimizer = model.optimizer.__class__.from_config(optimizer_config) # TODO(priyag): Is there a cleaner way to do this? The API doc suggests a # single tensor should be OK but it throws an error in that case. if (targets is not None and not isinstance(targets, list) and not isinstance(targets, dict)): targets = [targets] cloned_model.compile(optimizer, model.loss, metrics=model.metrics, loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=model.weighted_metrics, target_tensors=targets) return cloned_model
def test_gradient_tape_doesnt_crash_when_model_has_non_trainable_variables(self): # Given initial_model = Sequential([ tf.keras.layers.Input((1,)), Dense(3), BatchNormalization(), Dense(7) ]) initial_weights = initial_model.get_weights() x = np.array([[1]]) # When updated_model = clone_model(initial_model) take_n_gradient_step( initial_model, updated_model, n_step=1, alpha=1.0, loss=(lambda y, p: p), data_x=x, data_y=x ) # Then np.testing.assert_equal(initial_weights[4], updated_model.get_weights()[4]) # Moving mean np.testing.assert_equal(initial_weights[5], updated_model.get_weights()[5]) # Moving Variance
def test_2nd_order_gradient_through_updated_model(self): # Given initial_model = Sequential([ Dense(1, use_bias=False, kernel_initializer='ones', input_shape=(1,)), Lambda(lambda x: x ** 2) ]) x = np.array([[3]]) updated_model = clone_model(initial_model) # When with tf.GradientTape() as outer_tape: take_n_gradient_step( initial_model, updated_model, n_step=1, alpha=1.0, loss=(lambda y, p: p), data_x=x, data_y=x ) yp = updated_model(x) grad_of_grads = outer_tape.gradient(yp, initial_model.trainable_variables) # Then self.assertEqual(5202, grad_of_grads[0])
def _clone_and_build_model(model, inputs=None, targets=None): """Clone and build the given keras_model.""" # We need to set the import here since we run into a circular dependency # error. from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top cloned_model = models.clone_model(model, input_tensors=inputs) # Compile and build model. if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = model.optimizer else: optimizer_config = model.optimizer.get_config() optimizer = model.optimizer.__class__.from_config(optimizer_config) if isinstance(targets, tuple): targets = nest.flatten(targets) cloned_model.compile(optimizer, model.loss, metrics=metrics_module.clone_metrics(model.metrics), loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=metrics_module.clone_metrics( model.weighted_metrics), target_tensors=targets) return cloned_model
def _clone_and_build_model(model, strategy): # The new "original" model in worker 0. with strategy.scope(): cloned_model = models.clone_model(model) # Compile and build model. if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = model.optimizer # TODO(yuefengz): figure out why the optimizer here is still a # TFOptimizer. while isinstance(optimizer, optimizers.TFOptimizer): optimizer = optimizer.optimizer optimizer = copy.deepcopy(optimizer) else: optimizer_config = model.optimizer.get_config() optimizer = type(model.optimizer).from_config(optimizer_config) cloned_model.compile( optimizer, model.loss, metrics=metrics_module.clone_metrics(model._compile_metrics), loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=metrics_module.clone_metrics( model._compile_weighted_metrics)) return cloned_model
def __init__(self, transitions_seen_between_updates=10000, *args, **kwargs): super().__init__(*args, **kwargs) self.target_model = clone_model(self.online_model) self.n_model_updates = 0 self.transitions_seen_between_updates = transitions_seen_between_updates
def on_epoch_end(self, epoch, logs=None): acc = logs["val_accuracy"] if acc < min(self.top): return for i in range(self.m): if acc >= self.top[i] and epoch not in self.numbers: for j in range(self.m - 2, -1, -1): self.top[j + 1] = self.top[j] self.numbers[j + 1] = self.numbers[j] if self.models[j]: self.models[j + 1] = clone_model(self.models[j]) self.top[i] = acc self.models[i] = clone_model(self.model) self.numbers[i] = epoch break
def __init__(self, config, network, logger, start_epoch=0): self.discount_factor = config['discount_factor'] self.metrics = config['metrics'] self.loss = config['loss'] self.num_actions = config['num_actions'] self.train_model = network self.logger = logger self.start_epoch = start_epoch self.weights = None self.optimizer = self.create_optimizer(config) self.target_model = clone_model(self.train_model) self.compile_train_model() self.logger.set_start_epoch(start_epoch) self.tensor_board = self.logger.get_tensor_board() self.tensor_board.set_model(self.train_model)
def test_take_5_gradient_steps(self): # Given model = Sequential([ tf.keras.layers.Input((1,)), Dense(1, use_bias=False, kernel_initializer='ones'), ]) updated_model = clone_model(model) x = np.array([[1]]) y = np.array([[4]]) # When n_step = 5 alpha = 1.0 take_n_gradient_step(model, updated_model, n_step, alpha, tf.keras.losses.mse, x, y) # Then self.assertIsNotNone(updated_model(x))
def _clone_and_build_model(model, inputs=None, targets=None, mode=None): """Clone and build the given keras_model.""" # We need to set the import here since we run into a circular dependency # error. from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top cloned_model = models.clone_model(model, input_tensors=inputs) # Compile and build model. if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = model.optimizer else: optimizer_config = model.optimizer.get_config() optimizer = model.optimizer.__class__.from_config(optimizer_config) # Recast all low precision outputs back to float32 since we only casted # the inputs to bfloat16 and not targets. This is done so that we can preserve # precision when calculating the loss value. def _upcast_low_precision_outputs(output): if output.dtype == dtypes.bfloat16: return math_ops.cast(output, dtypes.float32) else: return output cloned_model.outputs = [ _upcast_low_precision_outputs(o) for o in cloned_model.outputs ] if isinstance(targets, tuple): targets = nest.flatten(targets) if mode == ModeKeys.PREDICT: _custom_compile_for_predict(cloned_model) else: cloned_model.compile(optimizer, model.loss, metrics=metrics_module.clone_metrics( model._compile_metrics), loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=metrics_module.clone_metrics( model._compile_weighted_metrics), target_tensors=targets) return cloned_model
def test_update_weights_creates_model_with_right_weights(self): # Given initial_model = create_2_layer_MLP() grads = initial_model.get_weights() # When to_be_updated_model = clone_model(initial_model) take_n_gradient_step( initial_model, to_be_updated_model, n_step=1, alpha=1.0, loss=(lambda y, p: p), data_x=np.array([[1]]), data_y=np.array([[1]]) ) to_be_updated_model_weights = [layer.kernel for layer in to_be_updated_model.layers if layer.trainable] # Then np.testing.assert_equal(to_be_updated_model_weights[0].numpy(), np.zeros((1, 2))) np.testing.assert_equal(to_be_updated_model_weights[1].numpy(), np.zeros((2, 1)))
def _clone_and_build_model(model, mode, inputs=None, targets=None): """Clone and build the given keras_model.""" # We need to set the import here since we run into a circular dependency # error. from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top cloned_model = models.clone_model(model, input_tensors=inputs) # Compile and build model. if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = model.optimizer else: optimizer_config = model.optimizer.get_config() optimizer = model.optimizer.__class__.from_config(optimizer_config) # Recast all low precision outputs back to float32 since we only casted # the inputs to bfloat16 and not targets. This is done so that we can preserve # precision when calculating the loss value. def _upcast_low_precision_outputs(output): if output.dtype == dtypes.bfloat16: return math_ops.cast(output, dtypes.float32) else: return output cloned_model.outputs = [_upcast_low_precision_outputs(o) for o in cloned_model.outputs] if isinstance(targets, tuple): targets = nest.flatten(targets) if mode == ModeKeys.PREDICT and inputs is not None: # TPU predict case _custom_compile_for_predict(cloned_model) else: cloned_model.compile( optimizer, model.loss, metrics=metrics_module.clone_metrics(model._compile_metrics), loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=metrics_module.clone_metrics( model._compile_weighted_metrics), target_tensors=targets) return cloned_model
def _clone_prebuilt_model(build_fn): """Clones and compiles a pre-built model when build_fn is an existing Keras model instance. Arguments: build_fn : instance of Keras Model. Returns: copy of the input model with no training. """ model = clone_model(build_fn) # clone_model does not compy over compilation parameters, do those manually model_metadata = saving_utils.model_metadata(build_fn) if "training_config" in model_metadata: training_config = model_metadata["training_config"] else: raise ValueError("To use %s as `build_fn`, you must compile" "it first." % build_fn) model.compile( **saving_utils.compile_args_from_training_config(training_config)) return model
def run_kt_methods() -> None: """ Runs all the available KT methods. """ methods = generate_appropriate_methods(kt_methods, temperature, kd_lambda_supervised, pkt_lambda_supervised, n_submodels) results = [] for method in methods: kt_logging.info('Performing {}...'.format(method['name'])) trained_student, history = knowledge_transfer(clone_model(student), method['method'], method['loss']) # TODO model_path = os.path.join(tempfile.gettempdir(), next(tempfile._get_candidate_names()) + '.h5') # and save student model there, when we stop needing it, # because it is inefficient to have it in memory until - if ever - we need to save it. # That way, when the time comes, we will just need to move it to the out folder. results.append({ 'method': method['name'], 'network': trained_student, 'history': history.history, 'evaluation': None }) # Add baseline to the results list. results.append({ 'method': 'Teacher', 'network': teacher, 'history': None, 'evaluation': None }) kt_logging.info('Evaluating results...') evaluate_results(results) kt_logging.info('Saving student network(s)...') save_students(save_students_mode, results[:-1], out_folder, results_name_prefix) if save_results: kt_logging.info('Saving results...') save_res(results, join(out_folder, results_name_prefix + 'results.pkl'))
def _model_fn(): """Compute fit/eval/predict for the TPU.""" is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT # During train/eval, we infeed our features as well as labels. if is_training or is_test: infeed_layers = self.model._input_layers + self.model._output_layers else: infeed_layers = self.model._input_layers # Generate our infeed operation to read features & labels. infeed_tensors = tpu_ops.infeed_dequeue_tuple( dtypes=[spec.dtype for spec in input_specs], shapes=[spec.shape for spec in input_specs], name='infeed-%s' % self.execution_mode) assert len(infeed_tensors) == len(infeed_layers), ( 'Infeed inputs did not match model: %s vs %s' % (infeed_layers, infeed_tensors)) tpu_targets = [] tpu_input_map = {} # Sort infeed outputs into inputs and labels for calling our Keras model. for tensor, layer in zip(infeed_tensors, infeed_layers): if layer in self.model._input_layers: tpu_input_map[layer.name] = tensor if layer in self.model._output_layers: tpu_targets.append(tensor) # Clone our CPU model, running within the TPU device context. with TPURewriteContext(tpu_input_map): # TODO(power): Replicate variables. with ops.device('/device:TPU:0'): self._cloned_model = models.clone_model(self.model) # Create a copy of the optimizer for this graph. if isinstance(self.model.optimizer, keras_optimizers.TFOptimizer): cloned_optimizer = keras_optimizers.TFOptimizer( self.model.optimizer.optimizer) else: logging.info('Cloning %s %s', self.model.optimizer.__class__.__name__, self._optimizer_config) cloned_optimizer = self.model.optimizer.__class__.from_config( self._optimizer_config) if is_training or is_test: self._cloned_model.compile( optimizer=_replicated_optimizer(cloned_optimizer), loss=self.model.loss, loss_weights=self.model.loss_weights, metrics=self.model.metrics, weighted_metrics=self.model.weighted_metrics, target_tensors=tpu_targets, ) # Compute our outfeed depending on the execution mode if is_training: self._cloned_model._make_train_function() self._outfeed_spec = [ tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) for tensor in self._cloned_model.train_function.outputs ] return [ self._cloned_model.train_function.updates_op, tpu_ops.outfeed_enqueue_tuple( self._cloned_model.train_function.outputs, name='outfeed-enqueue-train') ] elif is_test: self._cloned_model._make_test_function() self._outfeed_spec = [ tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) for tensor in self._cloned_model.test_function.outputs ] return [ tpu_ops.outfeed_enqueue_tuple( self._cloned_model.test_function.outputs, name='outfeed-enqueue-test') ] elif is_predict: self._cloned_model._make_predict_function() self._outfeed_spec = [ tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) for tensor in self._cloned_model.predict_function.outputs ] return [ tpu_ops.outfeed_enqueue_tuple( self._cloned_model.predict_function.outputs, name='outfeed-enqueue-predict', ) ] else: assert False, 'Unexpected execution mode: %s' % self.execution_mode
def _clone_and_build_model(mode, keras_model, custom_objects, features=None, labels=None): """Clone and build the given keras_model. Args: mode: training mode. keras_model: an instance of compiled keras model. custom_objects: Dictionary for custom objects. features: Dict of tensors. labels: Dict of tensors, or single tensor instance. Returns: The newly built model. """ # Set to True during training, False for inference. K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN) # Get list of inputs. if features is None: input_tensors = None else: input_tensors = _create_ordered_io(keras_model, estimator_io=features, is_input=True) # Get list of outputs. if labels is None: target_tensors = None elif isinstance(labels, dict): target_tensors = _create_ordered_io(keras_model, estimator_io=labels, is_input=False) else: target_tensors = [_convert_tensor(labels)] if keras_model._is_graph_network: if custom_objects: with CustomObjectScope(custom_objects): model = models.clone_model(keras_model, input_tensors=input_tensors) else: model = models.clone_model(keras_model, input_tensors=input_tensors) else: model = keras_model _in_place_subclassed_model_reset(model) if input_tensors is not None: model._set_inputs(input_tensors) # Compile/Build model if mode is model_fn_lib.ModeKeys.PREDICT: if isinstance(model, models.Sequential): model.build() else: if isinstance(keras_model.optimizer, optimizers.TFOptimizer): optimizer = keras_model.optimizer else: optimizer_config = keras_model.optimizer.get_config() optimizer = keras_model.optimizer.__class__.from_config( optimizer_config) optimizer.iterations = training_util.get_or_create_global_step() model.compile(optimizer, keras_model.loss, metrics=keras_model.metrics, loss_weights=keras_model.loss_weights, sample_weight_mode=keras_model.sample_weight_mode, weighted_metrics=keras_model.weighted_metrics, target_tensors=target_tensors) return model
def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False): """Replicates a model on different GPUs. Specifically, this function implements single-machine multi-GPU data parallelism. It works in the following way: - Divide the model's input(s) into multiple sub-batches. - Apply a model copy on each sub-batch. Every model copy is executed on a dedicated GPU. - Concatenate the results (on CPU) into one big batch. E.g. if your `batch_size` is 64 and you use `gpus=2`, then we will divide the input into 2 sub-batches of 32 samples, process each sub-batch on one GPU, then return the full batch of 64 processed samples. This induces quasi-linear speedup on up to 8 GPUs. This function is only available with the TensorFlow backend for the time being. Arguments: model: A Keras model instance. To avoid OOM errors, this model could have been built on CPU, for instance (see usage example below). gpus: Integer >= 2, number of on GPUs on which to create model replicas. cpu_merge: A boolean value to identify whether to force merging model weights under the scope of the CPU or not. cpu_relocation: A boolean value to identify whether to create the model's weights under the scope of the CPU. If the model is not defined under any preceding device scope, you can still rescue it by activating this option. Returns: A Keras `Model` instance which can be used just like the initial `model` argument, but which distributes its workload on multiple GPUs. Example 1: Training models with weights merge on CPU ```python import tensorflow as tf from keras.applications import Xception from keras.utils import multi_gpu_model import numpy as np num_samples = 1000 height = 224 width = 224 num_classes = 1000 # Instantiate the base model (or "template" model). # We recommend doing this with under a CPU device scope, # so that the model's weights are hosted on CPU memory. # Otherwise they may end up hosted on a GPU, which would # complicate weight sharing. with tf.device('/cpu:0'): model = Xception(weights=None, input_shape=(height, width, 3), classes=num_classes) # Replicates the model on 8 GPUs. # This assumes that your machine has 8 available GPUs. parallel_model = multi_gpu_model(model, gpus=8) parallel_model.compile(loss='categorical_crossentropy', optimizer='rmsprop') # Generate dummy data. x = np.random.random((num_samples, height, width, 3)) y = np.random.random((num_samples, num_classes)) # This `fit` call will be distributed on 8 GPUs. # Since the batch size is 256, each GPU will process 32 samples. parallel_model.fit(x, y, epochs=20, batch_size=256) # Save model via the template model (which shares the same weights): model.save('my_model.h5') ``` Example 2: Training models with weights merge on CPU using cpu_relocation ```python .. # Not needed to change the device scope for model definition: model = Xception(weights=None, ..) try: model = multi_gpu_model(model, cpu_relocation=True) print("Training using multiple GPUs..") except: print("Training using single GPU or CPU..") model.compile(..) .. ``` Example 3: Training models with weights merge on GPU (recommended for NV-link) ```python .. # Not needed to change the device scope for model definition: model = Xception(weights=None, ..) try: model = multi_gpu_model(model, cpu_merge=False) print("Training using multiple GPUs..") except: print("Training using single GPU or CPU..") model.compile(..) .. ``` Raises: ValueError: if the `gpus` argument does not match available devices. """ # pylint: disable=g-import-not-at-top from tensorflow.python.keras.layers.core import Lambda from tensorflow.python.keras.layers.merge import concatenate if isinstance(gpus, (list, tuple)): if len(gpus) <= 1: raise ValueError('For multi-gpu usage to be effective, ' 'call `multi_gpu_model` with `len(gpus) >= 2`. ' 'Received: `gpus=%s`' % gpus) num_gpus = len(gpus) target_gpu_ids = gpus else: if gpus <= 1: raise ValueError('For multi-gpu usage to be effective, ' 'call `multi_gpu_model` with `gpus >= 2`. ' 'Received: `gpus=%s`' % gpus) num_gpus = gpus target_gpu_ids = range(num_gpus) target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in target_gpu_ids] available_devices = _get_available_devices() available_devices = [ _normalize_device_name(name) for name in available_devices ] for device in target_devices: if device not in available_devices: raise ValueError('To call `multi_gpu_model` with `gpus=%s`, ' 'we expect the following devices to be available: %s. ' 'However this machine only has: %s. ' 'Try reducing `gpus`.' % (gpus, target_devices, available_devices)) def get_slice(data, i, parts): """Slice an array into `parts` slices and return slice `i`. Arguments: data: array to slice. i: index of slice to return. parts: number of slices to make. Returns: Slice `i` of `data`. """ shape = array_ops.shape(data) batch_size = shape[:1] input_shape = shape[1:] step = batch_size // parts if i == parts - 1: size = batch_size - step * i else: size = step size = array_ops.concat([size, input_shape], axis=0) stride = array_ops.concat([step, input_shape * 0], axis=0) start = stride * i return array_ops.slice(data, start, size) # Relocate the model definition under CPU device scope if needed if cpu_relocation: from tensorflow.python.keras.models import clone_model # pylint: disable=g-import-not-at-top with ops.device('/cpu:0'): model = clone_model(model) all_outputs = [] for i in range(len(model.outputs)): all_outputs.append([]) # Place a copy of the model on each GPU, # each getting a slice of the inputs. for i, gpu_id in enumerate(target_gpu_ids): with ops.device('/gpu:%d' % gpu_id): with ops.name_scope('replica_%d' % gpu_id): inputs = [] # Retrieve a slice of the input. for x in model.inputs: input_shape = tuple(x.get_shape().as_list())[1:] slice_i = Lambda( get_slice, output_shape=input_shape, arguments={ 'i': i, 'parts': num_gpus })( x) inputs.append(slice_i) # Apply model on slice # (creating a model replica on the target device). outputs = model(inputs) if not isinstance(outputs, list): outputs = [outputs] # Save the outputs for merging back together later. for o in range(len(outputs)): all_outputs[o].append(outputs[o]) # Deduplicate output names to handle Siamese networks. occurrences = {} for n in model.output_names: if n not in occurrences: occurrences[n] = 1 else: occurrences[n] += 1 conflict_counter = {n: 0 for n, count in occurrences.items() if count > 1} output_names = [] for n in model.output_names: if n in conflict_counter: conflict_counter[n] += 1 n += '_%d' % conflict_counter[n] output_names.append(n) # Merge outputs under expected scope. with ops.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]): merged = [] for name, outputs in zip(output_names, all_outputs): merged.append(concatenate(outputs, axis=0, name=name)) return Model(model.inputs, merged)
def _clone_and_build_model(mode, keras_model, custom_objects, features=None, labels=None): """Clone and build the given keras_model. Args: mode: training mode. keras_model: an instance of compiled keras model. custom_objects: Dictionary for custom objects. features: Dict of tensors. labels: Dict of tensors, or single tensor instance. Returns: The newly built model. """ # Set to True during training, False for inference. K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN) # Get list of inputs. if features is None: input_tensors = None else: input_tensors = _create_ordered_io(keras_model, estimator_io=features, is_input=True) # Get list of outputs. if labels is None: target_tensors = None elif isinstance(labels, dict): target_tensors = _create_ordered_io(keras_model, estimator_io=labels, is_input=False) else: target_tensors = [ _convert_tensor(labels) ] if keras_model._is_graph_network: if custom_objects: with CustomObjectScope(custom_objects): model = models.clone_model(keras_model, input_tensors=input_tensors) else: model = models.clone_model(keras_model, input_tensors=input_tensors) else: model = keras_model _in_place_subclassed_model_reset(model) if input_tensors is not None: model._set_inputs(input_tensors) # Compile/Build model if mode is model_fn_lib.ModeKeys.PREDICT: if isinstance(model, models.Sequential): model.build() else: if isinstance(keras_model.optimizer, optimizers.TFOptimizer): optimizer = keras_model.optimizer else: optimizer_config = keras_model.optimizer.get_config() optimizer = keras_model.optimizer.__class__.from_config(optimizer_config) optimizer.iterations = training_util.get_or_create_global_step() model.compile( optimizer, keras_model.loss, metrics=keras_model.metrics, loss_weights=keras_model.loss_weights, sample_weight_mode=keras_model.sample_weight_mode, weighted_metrics=keras_model.weighted_metrics, target_tensors=target_tensors) return model
def _model_fn(): """Compute fit/eval/predict for the TPU.""" is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT # During train/eval, we infeed our features as well as labels. if is_training or is_test: infeed_layers = self.model._input_layers + self.model._output_layers else: infeed_layers = self.model._input_layers # Generate our infeed operation to read features & labels. infeed_tensors = tpu_ops.infeed_dequeue_tuple( dtypes=[spec.dtype for spec in input_specs], shapes=[spec.shape for spec in input_specs], name='infeed-%s' % self.execution_mode) assert len(infeed_tensors) == len(infeed_layers), ( 'Infeed inputs did not match model: %s vs %s', (infeed_layers, infeed_tensors)) tpu_targets = [] tpu_input_map = {} # Sort infeed outputs into inputs and labels for calling our Keras model. for tensor, layer in zip(infeed_tensors, infeed_layers): if layer in self.model._input_layers: tpu_input_map[layer.name] = tensor if layer in self.model._output_layers: tpu_targets.append(tensor) # Clone our CPU model, running within the TPU device context. with TPURewriteContext(tpu_input_map): self._cloned_model = models.clone_model(self.model) # Create a copy of the optimizer for this graph. if isinstance(self.model.optimizer, keras_optimizers.TFOptimizer): cloned_optimizer = keras_optimizers.TFOptimizer( self.model.optimizer.optimizer) else: logging.info('Cloning %s %s', self.model.optimizer.__class__.__name__, self._optimizer_config) cloned_optimizer = self.model.optimizer.__class__.from_config( self._optimizer_config) if is_training or is_test: self._cloned_model.compile( optimizer=_replicated_optimizer(cloned_optimizer), loss=self.model.loss, loss_weights=self.model.loss_weights, metrics=self.model.metrics, weighted_metrics=self.model.weighted_metrics, target_tensors=tpu_targets, ) # Compute our outfeed depending on the execution mode if is_training: self._cloned_model._make_train_function() self._outfeed_spec = [ tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) for tensor in self._cloned_model.train_function.outputs ] return [ self._cloned_model.train_function.updates_op, tpu_ops.outfeed_enqueue_tuple( self._cloned_model.train_function.outputs, name='outfeed-enqueue-train') ] elif is_test: self._cloned_model._make_test_function() self._outfeed_spec = [ tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) for tensor in self._cloned_model.test_function.outputs ] return [ tpu_ops.outfeed_enqueue_tuple( self._cloned_model.test_function.outputs, name='outfeed-enqueue-test') ] elif is_predict: self._cloned_model._make_predict_function() self._outfeed_spec = [ tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) for tensor in self._cloned_model.predict_function.outputs ] return [ tpu_ops.outfeed_enqueue_tuple( self._cloned_model.predict_function.outputs, name='outfeed-enqueue-predict', ) ] else: assert False, 'Unexpected execution mode: %s' % self.execution_mode
def clone_and_build_model(model, input_tensors=None, target_tensors=None, custom_objects=None, compile_clone=True, in_place_reset=False, optimizer_iterations=None, optimizer_config=None): orig_optimizer = model.optimizer if compile_clone and not orig_optimizer: raise ValueError( 'Error when cloning model: compile_clone was set to True, but the ' 'original model has not been compiled.') if model._is_graph_network or isinstance(model, Sequential): if custom_objects: with CustomObjectScope(custom_objects): clone = models.clone_model(model, input_tensors=input_tensors) else: clone = models.clone_model(model, input_tensors=input_tensors) if all([ isinstance(clone, Sequential), not clone._is_graph_network, getattr(model, '_build_input_shape', None) is not None ]): clone._set_inputs( K.placeholder(model._build_input_shape, dtype=model.inputs[0].dtype)) else: if not in_place_reset: raise ValueError('.') clone = model _in_place_subclassed_model_reset(clone) if input_tensors is not None: if isinstance(input_tensors, (list, tuple)) and len(input_tensors) == 1: input_tensors = input_tensors[0] clone._set_inputs(input_tensors) if compile_clone: if isinstance(orig_optimizer, optimizers.TFOptimizer): optimizer = optimizers.TFOptimizer(orig_optimizer.optimizer, optimizer_iterations) K.track_tf_optimizer(optimizer) else: optimizer_config = optimizer_config or orig_optimizer.get_config() # print("orig_optimizer :", orig_optimizer) # print("orig_optimizer.c . :",orig_optimizer.__class__) # print("orig_optimizer.c.i . :", orig_optimizer.__class__.__init__) # print("optimizer_config :", optimizer_config) # print("orig_optimizer.c.i.args :", inspect.getargspec(orig_optimizer.__class__.__init__)) # print("orig_optimizer.c.i.dict :", orig_optimizer.__class__.__dict__) # print("orig_optimizer.c._b_ . :", orig_optimizer.__class__.__bases__) #orig_optimizer : <horovod._keras.Adam object at 0x7f803812aac8> #orig_optimizer.c . : <class 'horovod._keras.Adam'> #orig_optimizer.c.i . : <function create_distributed_optimizer.<locals>._DistributedOptimizer.__init__ at 0x7f80480840d0> #optimizer_config : {'name': 'Adam', 'learning_rate': 0.0014000001, 'decay': 0.0002, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07, 'amsgrad': False} #orig_optimizer.c.i.args : ArgSpec(args=['self', 'name', 'device_dense', 'device_sparse', 'compression', 'sparse_as_dense', 'config'], varargs=None, keywords=None, defaults=None) #orig_optimizer.c.i.dict : {'__module__': 'horovod._keras', '__init__': <function create_distributed_optimizer.<locals>._DistributedOptimizer.__init__ at 0x7f80480840d0>, 'get_gradients': <function create_distributed_optimizer.<locals>._DistributedOptimizer.get_gradients at 0x7f80480847b8>, '__doc__': None, '__abstractmethods__': frozenset(), '_abc_registry': <_weakrefset.WeakSet object at 0x7f8038112cf8>, '_abc_cache': <_weakrefset.WeakSet object at 0x7f803812ac50>, '_abc_negative_cache': <_weakrefset.WeakSet object at 0x7f803812a6d8>, '_abc_negative_cache_version': 51} if "horovod._keras" not in str(type(orig_optimizer)): optimizer = orig_optimizer.__class__.from_config( optimizer_config) else: optimizer = orig_optimizer.__class__.__bases__[0].from_config( optimizer_config) if optimizer_iterations is not None: optimizer.iterations = optimizer_iterations clone.compile(optimizer, model.loss, metrics=metrics_module.clone_metrics( model._compile_metrics), loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=metrics_module.clone_metrics( model._compile_weighted_metrics), target_tensors=target_tensors) return clone
def clone_and_build_model(model, input_tensors=None, target_tensors=None, custom_objects=None, compile_clone=True, in_place_reset=False, optimizer_iterations=None): """1.13""" if compile_clone and not model.optimizer: raise ValueError( 'Error when cloning model: compile_clone was set to True, but the ' 'original model has not been compiled.') if model._is_graph_network or isinstance(model, Sequential): if custom_objects: with CustomObjectScope(custom_objects): clone = models.clone_model(model, input_tensors=input_tensors) else: clone = models.clone_model(model, input_tensors=input_tensors) if all([ isinstance(clone, Sequential), not models.clone._is_graph_network, getattr(model, '_build_input_shape', None) is not None ]): clone._set_inputs( K.placeholder(model._build_input_shape, dtype=model.inputs[0].dtype)) else: if not in_place_reset: raise ValueError('.') clone = model _in_place_subclassed_model_reset(clone) if input_tensors is not None: if isinstance(input_tensors, (list, tuple)) and len(input_tensors) == 1: input_tensors = input_tensors[0] models.clone._set_inputs(input_tensors) if compile_clone and model.optimizer: if isinstance(model.optimizer, optimizers.TFOptimizer): optimizer = optimizers.TFOptimizer(model.optimizer.optimizer, optimizer_iterations) K.track_tf_optimizer(optimizer) else: optimizer_config = model.optimizer.get_config() optimizer = model.optimizer.__class__.from_config(optimizer_config) if optimizer_iterations is not None: optimizer.iterations = optimizer_iterations models.clone.compile(optimizer, model.loss, metrics=metrics_module.clone_metrics( model._compile_metrics), loss_weights=model.loss_weights, sample_weight_mode=model.sample_weight_mode, weighted_metrics=metrics_module.clone_metrics( model._compile_weighted_metrics), target_tensors=target_tensors) return clone
def pre_build(self, *args): self.model = clone_model(self.model)
def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False): """Replicates a model on different GPUs. Specifically, this function implements single-machine multi-GPU data parallelism. It works in the following way: - Divide the model's input(s) into multiple sub-batches. - Apply a model copy on each sub-batch. Every model copy is executed on a dedicated GPU. - Concatenate the results (on CPU) into one big batch. E.g. if your `batch_size` is 64 and you use `gpus=2`, then we will divide the input into 2 sub-batches of 32 samples, process each sub-batch on one GPU, then return the full batch of 64 processed samples. This induces quasi-linear speedup on up to 8 GPUs. This function is only available with the TensorFlow backend for the time being. Args: model: A Keras model instance. To avoid OOM errors, this model could have been built on CPU, for instance (see usage example below). gpus: Integer >= 2, number of on GPUs on which to create model replicas. cpu_merge: A boolean value to identify whether to force merging model weights under the scope of the CPU or not. cpu_relocation: A boolean value to identify whether to create the model's weights under the scope of the CPU. If the model is not defined under any preceding device scope, you can still rescue it by activating this option. Returns: A Keras `Model` instance which can be used just like the initial `model` argument, but which distributes its workload on multiple GPUs. Example 1: Training models with weights merge on CPU ```python import tensorflow as tf from keras.applications import Xception from keras.utils import multi_gpu_model import numpy as np num_samples = 1000 height = 224 width = 224 num_classes = 1000 # Instantiate the base model (or "template" model). # We recommend doing this with under a CPU device scope, # so that the model's weights are hosted on CPU memory. # Otherwise they may end up hosted on a GPU, which would # complicate weight sharing. with tf.device('/cpu:0'): model = Xception(weights=None, input_shape=(height, width, 3), classes=num_classes) # Replicates the model on 8 GPUs. # This assumes that your machine has 8 available GPUs. parallel_model = multi_gpu_model(model, gpus=8) parallel_model.compile(loss='categorical_crossentropy', optimizer='rmsprop') # Generate dummy data. x = np.random.random((num_samples, height, width, 3)) y = np.random.random((num_samples, num_classes)) # This `fit` call will be distributed on 8 GPUs. # Since the batch size is 256, each GPU will process 32 samples. parallel_model.fit(x, y, epochs=20, batch_size=256) # Save model via the template model (which shares the same weights): model.save('my_model.h5') ``` Example 2: Training models with weights merge on CPU using cpu_relocation ```python .. # Not needed to change the device scope for model definition: model = Xception(weights=None, ..) try: model = multi_gpu_model(model, cpu_relocation=True) print("Training using multiple GPUs..") except: print("Training using single GPU or CPU..") model.compile(..) .. ``` Example 3: Training models with weights merge on GPU (recommended for NV-link) ```python .. # Not needed to change the device scope for model definition: model = Xception(weights=None, ..) try: model = multi_gpu_model(model, cpu_merge=False) print("Training using multiple GPUs..") except: print("Training using single GPU or CPU..") model.compile(..) .. ``` Raises: ValueError: if the `gpus` argument does not match available devices. """ if isinstance(gpus, (list, tuple)): if len(gpus) <= 1: raise ValueError('For multi-gpu usage to be effective, ' 'call `multi_gpu_model` with `len(gpus) >= 2`. ' 'Received: `gpus=%s`' % gpus) num_gpus = len(gpus) target_gpu_ids = gpus else: if gpus <= 1: raise ValueError('For multi-gpu usage to be effective, ' 'call `multi_gpu_model` with `gpus >= 2`. ' 'Received: `gpus=%s`' % gpus) num_gpus = gpus target_gpu_ids = range(num_gpus) target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in target_gpu_ids] available_devices = _get_available_devices() available_devices = [ _normalize_device_name(name) for name in available_devices ] for device in target_devices: if device not in available_devices: raise ValueError('To call `multi_gpu_model` with `gpus=%s`, ' 'we expect the following devices to be available: %s. ' 'However this machine only has: %s. ' 'Try reducing `gpus`.' % (gpus, target_devices, available_devices)) def get_slice(data, i, parts): """Slice an array into `parts` slices and return slice `i`. Args: data: array to slice. i: index of slice to return. parts: number of slices to make. Returns: Slice `i` of `data`. """ shape = array_ops.shape(data) batch_size = shape[:1] input_shape = shape[1:] step = batch_size // parts if i == parts - 1: size = batch_size - step * i else: size = step size = array_ops.concat([size, input_shape], axis=0) stride = array_ops.concat([step, input_shape * 0], axis=0) start = stride * i return array_ops.slice(data, start, size) # Relocate the model definition under CPU device scope if needed if cpu_relocation: from tensorflow.python.keras.models import clone_model # pylint: disable=g-import-not-at-top with ops.device('/cpu:0'): model = clone_model(model) all_outputs = [[] for _ in range(len(model.outputs))] # Place a copy of the model on each GPU, # each getting a slice of the inputs. for i, gpu_id in enumerate(target_gpu_ids): with ops.device('/gpu:%d' % gpu_id): with backend.name_scope('replica_%d' % gpu_id): inputs = [] # Retrieve a slice of the input. for x in model.inputs: input_shape = tuple(x.shape.as_list())[1:] slice_i = Lambda( get_slice, output_shape=input_shape, arguments={ 'i': i, 'parts': num_gpus })( x) inputs.append(slice_i) # Apply model on slice # (creating a model replica on the target device). outputs = model(inputs) if not isinstance(outputs, list): outputs = [outputs] # Save the outputs for merging back together later. for o, output in enumerate(outputs): all_outputs[o].append(output) # Deduplicate output names to handle Siamese networks. occurrences = {} for n in model.output_names: if n not in occurrences: occurrences[n] = 1 else: occurrences[n] += 1 conflict_counter = {n: 0 for n, count in occurrences.items() if count > 1} output_names = [] for n in model.output_names: if n in conflict_counter: conflict_counter[n] += 1 n += '_%d' % conflict_counter[n] output_names.append(n) # Merge outputs under expected scope. with ops.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]): merged = [] for name, outputs in zip(output_names, all_outputs): merged.append(concatenate(outputs, axis=0, name=name)) return Model(model.inputs, merged)