def _get_input_iterator(
    self, input_fn: Callable[[Optional[params_dict.ParamsDict]],
                             tf.data.Dataset],
    strategy: tf.distribute.Strategy) -> Optional[Iterator[Any]]:
  """Returns a distributed dataset iterator.

  Args:
    input_fn: (params: dict) -> tf.data.Dataset.
    strategy: an instance of tf.distribute.Strategy.

  Returns:
    An iterator that yields input tensors.
  """
  if input_fn is None:
    return None
  # When training with multiple TPU workers, the dataset needs to be cloned
  # across workers. Since a Dataset instance cannot be cloned in eager mode,
  # we instead pass a callable that returns a dataset.
  if self._is_multi_host:
    return iter(
        strategy.experimental_distribute_datasets_from_function(input_fn))
  else:
    input_data = input_fn(self._params)
    return iter(strategy.experimental_distribute_dataset(input_data))
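
# A minimal usage sketch for the pattern above (assumed driver code, not part
# of the original class): the distributed iterator is advanced with `next()`
# and each element is fed to strategy.run. The default (no-op) strategy is
# used here so the sketch runs on a single host.
import tensorflow as tf

strategy = tf.distribute.get_strategy()

def input_fn(params=None):
  # `params` is unused in this toy input_fn.
  del params
  return tf.data.Dataset.from_tensor_slices(tf.range(8)).batch(2)

iterator = iter(strategy.experimental_distribute_dataset(input_fn()))

@tf.function
def train_step(batch):
  return tf.reduce_sum(batch)

for _ in range(2):
  strategy.run(train_step, args=(next(iterator),))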
def create_dataset(
    dataset_builder: base.BaseDataset,
    batch_size: int,
    process_fn: Any,
    distributed_strategy: tf.distribute.Strategy,
    distributed: bool
) -> Union[tf.data.Dataset, tf.distribute.DistributedDataset]:
  """Creates an (optionally distributed) dataset from dataset_builder and process_fn."""
  dataset = dataset_builder.load(batch_size=batch_size).map(process_fn)
  if distributed:
    dataset = distributed_strategy.experimental_distribute_dataset(dataset)
  return dataset
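
# Hypothetical usage of create_dataset; `ToyBuilder` is a stand-in for a
# concrete base.BaseDataset implementation and is not from the original code.
# Note that `batch_size` is the global batch size: when distributed=True,
# experimental_distribute_dataset splits each batch across the replicas in
# sync.
import tensorflow as tf

class ToyBuilder:
  """Minimal stand-in for the assumed base.BaseDataset interface."""

  def load(self, batch_size):
    return tf.data.Dataset.from_tensor_slices(tf.range(32)).batch(batch_size)

strategy = tf.distribute.MirroredStrategy()
dataset = create_dataset(
    dataset_builder=ToyBuilder(),
    batch_size=8,
    process_fn=lambda x: tf.cast(x, tf.float32),
    distributed_strategy=strategy,
    distributed=True)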
def read(
    self,
    mode: str,
    mirrored_strategy: Optional[tf.distribute.Strategy] = None
) -> Tuple[tf.data.Dataset, int]:
  """Reads the dataset, distributing it across replicas if a strategy is given."""
  if mirrored_strategy:
    num_gpus = mirrored_strategy.num_replicas_in_sync
    with mirrored_strategy.scope():
      dataset, num_iters = self._read(mode, self._batch_size * num_gpus)
      dataset = mirrored_strategy.experimental_distribute_dataset(dataset)
    return dataset, num_iters
  else:
    return self._read(mode, self._batch_size)
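
# Hedged illustration of the batch-size scaling done in read(): the per-GPU
# batch size is multiplied by num_replicas_in_sync so that, after
# experimental_distribute_dataset splits each global batch, every replica
# still sees the original per-replica batch size. All names below are
# illustrative.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
per_replica_batch_size = 4
global_batch_size = per_replica_batch_size * strategy.num_replicas_in_sync
dataset = tf.data.Dataset.from_tensor_slices(tf.range(64)).batch(
    global_batch_size)
dist_dataset = strategy.experimental_distribute_dataset(dataset)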
def compute_predictions(
    model: PredictionModel, dataset: tf.data.Dataset,
    strategy: tf.distribute.Strategy, batch_size: int
) -> Iterator[Tuple[types.ModelPredictions, types.Features]]:
  """Yield the predictions of the model on the given dataset.

  Args:
    model: A function that takes tensor-valued features and returns a vector
      of predictions.
    dataset: The dataset that the function consumes to produce the
      predictions.
    strategy: The distribution strategy to use when computing.
    batch_size: The batch size that should be used.

  Yields:
    Pairs of model predictions and the corresponding metadata.
  """
  with strategy.scope():
    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = (
        tf.data.experimental.AutoShardPolicy.DATA)
    dataset = dataset.with_options(options)
    for features in strategy.experimental_distribute_dataset(dataset):
      time_start = time.time()
      if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
        # TODO(josipd): Figure this out better. We can't easily filter,
        # as they are PerReplica values, not tensors.
        features_model = {"image": features["image"]}
      else:
        features_model = features
      predictions = materialize(strategy,
                                strategy.run(model, args=(features_model,)))
      time_end = time.time()
      time_delta_per_example = (time_end - time_start) / predictions.shape[0]
      metadatas = materialize(strategy, features["metadata"])
      for i in range(predictions.shape[0]):
        model_predictions = types.ModelPredictions(
            predictions=[predictions[i]], time_in_s=time_delta_per_example)
        metadata_i = _slice_dictionary(metadatas, i)
        yield model_predictions, metadata_i
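
# _slice_dictionary is referenced above but not defined in this section. A
# plausible minimal reconstruction (an assumption, not the original source)
# takes the i-th entry of every batched tensor in a metadata dictionary:
def _slice_dictionary(tensor_dict, i):
  """Hypothetical helper: selects index `i` from each value in the dict."""
  return {key: value[i] for key, value in tensor_dict.items()}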
def compute_predictions(
    model: PredictionModel, dataset: tf.data.Dataset,
    strategy: tf.distribute.Strategy) -> Iterator[types.ModelPredictions]:
  """Yield the predictions of the model on the given dataset.

  Note that the dataset is expected to yield batches of tensors.

  Args:
    model: A function that takes tensor-valued features and returns a vector
      of predictions.
    dataset: The dataset that the function consumes to produce the
      predictions.
    strategy: The distribution strategy to use when computing.

  Yields:
    The predictions of the model on the dataset.
  """
  for features in strategy.experimental_distribute_dataset(dataset):
    # TODO(josipd): Figure out how to pass only tpu-allowed types.
    time_start = time.time()
    predictions = materialize(
        strategy, strategy.run(model, args=({"image": features["image"]},)))
    time_end = time.time()
    time_delta_per_example = (time_end - time_start) / predictions.shape[0]
    try:
      element_ids = materialize(strategy, features["element_id"])
    except KeyError:
      element_ids = [None] * predictions.shape[0]
    metadatas = materialize(strategy, features["metadata"])
    for i in range(predictions.shape[0]):
      yield types.ModelPredictions(
          element_id=element_ids[i],
          metadata=_slice_dictionary(metadatas, i),
          predictions=[predictions[i]],
          time_in_s=time_delta_per_example)
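
# `materialize` is likewise referenced but not defined here. A common pattern
# for such a helper (an assumption about its behavior, not the actual
# implementation) is to gather the per-replica components of a strategy.run
# result and concatenate them into a single host tensor:
import tensorflow as tf

def materialize(strategy, values):
  """Hypothetical helper: concatenates per-replica results on the host."""
  return tf.concat(strategy.experimental_local_results(values), axis=0).numpy()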
def get_datasets(args, strategy: tf.distribute.Strategy, buffer_size: int = 256):
  """Load and return the preprocessed and distributed horse2zebra dataset."""
  dataset, metadata = tfds.load("cycle_gan/horse2zebra",
                                with_info=True,
                                as_supervised=True)
  train_horses, train_zebras = dataset["trainA"], dataset["trainB"]
  test_horses, test_zebras = dataset["testA"], dataset["testB"]

  # Calculate the number of train and test steps needed per epoch.
  get_size = lambda name: metadata.splits.get(name).num_examples
  num_train_samples = min(get_size("trainA"), get_size("trainB"))
  num_test_samples = min(get_size("testA"), get_size("testB"))
  args.train_steps = ceil(num_train_samples / args.global_batch_size)
  args.test_steps = ceil(num_test_samples / args.global_batch_size)

  def normalize_image(image):
    """Normalize image to [-1, 1]."""
    image = tf.cast(image, dtype=tf.float32)
    return (image / 127.5) - 1.0

  def preprocess_train(image, _):
    image = tf.image.random_flip_left_right(image)
    image = tf.image.resize(image, size=IMAGE_SHAPE)
    image = tf.image.random_crop(image, size=INPUT_SHAPE)
    image = normalize_image(image)
    return image

  def preprocess_test(image, _):
    image = tf.image.resize(image, size=INPUT_SHAPE[:2])
    image = normalize_image(image)
    return image

  train_horses = train_horses.take(num_train_samples)
  train_horses = train_horses.map(preprocess_train, num_parallel_calls=AUTOTUNE)
  train_horses = train_horses.cache()
  train_horses = train_horses.shuffle(buffer_size)

  train_zebras = train_zebras.take(num_train_samples)
  train_zebras = train_zebras.map(preprocess_train, num_parallel_calls=AUTOTUNE)
  train_zebras = train_zebras.cache()
  train_zebras = train_zebras.shuffle(buffer_size)

  test_horses = test_horses.take(num_test_samples)
  test_horses = test_horses.map(preprocess_test, num_parallel_calls=AUTOTUNE)
  test_horses = test_horses.cache()

  test_zebras = test_zebras.take(num_test_samples)
  test_zebras = test_zebras.map(preprocess_test, num_parallel_calls=AUTOTUNE)
  test_zebras = test_zebras.cache()

  train_ds = tf.data.Dataset.zip(
      (train_horses.batch(args.global_batch_size),
       train_zebras.batch(args.global_batch_size))).prefetch(AUTOTUNE)
  test_ds = tf.data.Dataset.zip((test_horses.batch(args.global_batch_size),
                                 test_zebras.batch(args.global_batch_size)))

  # Take 5 samples from the test set for plotting.
  plot_ds = tf.data.Dataset.zip(
      (test_horses.take(5).batch(1), test_zebras.take(5).batch(1)))

  # Create distributed datasets.
  train_ds = strategy.experimental_distribute_dataset(train_ds)
  test_ds = strategy.experimental_distribute_dataset(test_ds)

  return train_ds, test_ds, plot_ds
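
# Hypothetical driver for get_datasets. IMAGE_SHAPE, INPUT_SHAPE, AUTOTUNE,
# and ceil are module-level names in the original code; the values below are
# the standard CycleGAN preprocessing sizes and are assumptions here.
import types
from math import ceil

import tensorflow as tf
import tensorflow_datasets as tfds

IMAGE_SHAPE = (286, 286)
INPUT_SHAPE = (256, 256, 3)
AUTOTUNE = tf.data.experimental.AUTOTUNE

strategy = tf.distribute.MirroredStrategy()
args = types.SimpleNamespace(
    global_batch_size=1 * strategy.num_replicas_in_sync)
train_ds, test_ds, plot_ds = get_datasets(args, strategy)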
def run(train_dataset: tf.data.Dataset,
        eval_datasets: Dict[str, tf.data.Dataset],
        steps_per_eval: Dict[str, int],
        params: utils.ModelParameters,
        model_dir: str,
        strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter,
        loss_type: str,
        graph_augmenter: augmentation_utils.GraphAugment):
  """Trains and evaluates the model."""
  with strategy.scope():
    model = ub.models.mpnn(
        nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
        edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
        num_heads=params.num_heads,
        num_layers=params.num_layers,
        message_layer_size=params.message_layer_size,
        readout_layer_size=params.readout_layer_size,
        use_gp_layer=params.use_gp_layer)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=params.learning_rate)

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/roc_auc': tf.keras.metrics.AUC(),
    }
    for dataset_name in eval_datasets:
      metrics[f'{dataset_name}/accuracy'] = (
          tf.keras.metrics.CategoricalAccuracy())
      metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
      metrics[f'{dataset_name}/negative_log_likelihood'] = (
          tf.keras.metrics.Mean())
      if dataset_name == 'test2':
        ece_num_bins = 5
      else:
        ece_num_bins = 10
      metrics[f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
          num_bins=ece_num_bins)
      metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

  @tf.function
  def train_step(iterator):
    """Training StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      if len(inputs) == 3:
        features, labels, sample_weights = inputs
      else:
        features, labels = inputs
        sample_weights = 1

      if params.augmentations:
        # TODO(jihyeonlee): For now, choose 1 augmentation function from all
        # possible with equal probability. Allow user to specify number of
        # augmentations to apply per graph.
        features = graph_augmenter.augment(features)

      with tf.GradientTape() as tape:
        probs = model(features, training=True)
        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.categorical_crossentropy(labels, probs) *
            sample_weights)
        l2_loss = sum(model.losses)
        if loss_type == 'focal':
          focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
          focal_loss = tf.reduce_mean(
              focal_loss_fn(labels, probs) * sample_weights)
          loss = focal_loss + l2_loss
        else:
          loss = negative_log_likelihood + l2_loss
        # Scale the loss, since tf.distribute.Strategy will reduce-sum the
        # gradients across replicas. See details in
        # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))

      metrics['train/loss'].update_state(loss)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics['train/accuracy'].update_state(labels, probs)
      metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

    for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
      strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def eval_step(iterator, dataset_name, num_steps):
    """Evaluation StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      if len(inputs) == 3:
        features, labels, _ = inputs
      else:
        features, labels = inputs

      probs = model(features, training=False)
      negative_log_likelihood = tf.reduce_mean(
          tf.keras.losses.categorical_crossentropy(labels, probs))
      metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
      metrics[f'{dataset_name}/roc_auc'].update_state(labels[:, 1], probs[:, 1])
      metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1], label=labels[:, 1])
      metrics[f'{dataset_name}/brier'].add_batch(probs, label=labels[:, 1])

    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      strategy.run(step_fn, args=(next(iterator),))

  # Make the datasets into distributed versions.
  train_dataset = strategy.experimental_distribute_dataset(train_dataset)
  eval_datasets = {
      ds_name: strategy.experimental_distribute_dataset(ds)
      for ds_name, ds in eval_datasets.items()
  }
  logging.info('Number of replicas in sync: %s', strategy.num_replicas_in_sync)

  train_iterator = iter(train_dataset)
  start_time = time.time()
  metrics_history = collections.defaultdict(list)
  for epoch in range(params.num_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    train_step(train_iterator)

    current_step = (epoch + 1) * params.steps_per_epoch
    max_steps = params.steps_per_epoch * params.num_epochs
    time_elapsed = time.time() - start_time
    steps_per_sec = float(current_step) / time_elapsed
    eta_seconds = (max_steps - current_step) / steps_per_sec
    message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
               'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                   current_step / max_steps, epoch + 1, params.num_epochs,
                   steps_per_sec, eta_seconds / 60, time_elapsed / 60))
    logging.info(message)

    # Start evaluation.
    logging.info('Starting to run eval at epoch: %s', epoch)
    for dataset_name, eval_dataset in eval_datasets.items():
      eval_iterator = iter(eval_dataset)
      eval_step(eval_iterator, dataset_name, steps_per_eval[dataset_name])

    metrics_history['epoch'].append(epoch + 1)
    with summary_writer.as_default():
      for name, metric in metrics.items():
        result = utils.get_metric_result_value(metric)
        tf.summary.scalar(name, result, step=epoch + 1)
        metrics_history[name].append(str(result))

    for metric in metrics.values():
      metric.reset_states()

    model.save(os.path.join(model_dir, f'model_{epoch + 1}'), overwrite=True)

  utils.write_params(metrics_history,
                     os.path.join(model_dir, 'metrics_history.json'))
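
# A runnable sketch of the loss-scaling rule used in step_fn above: each
# replica divides its loss by num_replicas_in_sync because the strategy
# reduce-sums gradients across replicas; summing the per-replica gradients of
# the scaled loss then matches the single-replica gradient. Names here are
# illustrative only.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
num_replicas = strategy.num_replicas_in_sync

with strategy.scope():
  w = tf.Variable(1.0)

@tf.function
def grad_sum():
  def step_fn():
    with tf.GradientTape() as tape:
      scaled_loss = (2.0 * w) / num_replicas  # per-replica loss, scaled
    return tape.gradient(scaled_loss, w)

  per_replica_grads = strategy.run(step_fn)
  # Reduce-summing recovers the single-replica gradient d(2w)/dw = 2.0.
  return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_grads,
                         axis=None)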
def run(train_dataset: tf.data.Dataset,
        eval_datasets: Dict[str, tf.data.Dataset],
        steps_per_eval: Dict[str, int],
        params: utils.ModelParameters,
        model_dir: str,
        gp_layer_kwargs: Dict[str, Any],
        strategy: tf.distribute.Strategy,
        summary_writer: tf.summary.SummaryWriter,
        loss_type: str,
        use_spec_norm: bool,
        spec_norm_multiplier: float,
        use_spec_norm_mp: bool,
        spec_norm_multiplier_mp: float):
  """Trains and evaluates the model.

  Args:
    train_dataset: tf dataset that provides training data.
    eval_datasets: A dictionary of tf datasets that provide data for model
      evaluation.
    steps_per_eval: A dictionary of steps needed for each evaluation dataset.
    params: ModelParameters object containing MPNN model parameters.
    model_dir: Directory for files generated during training and evaluation.
    gp_layer_kwargs: A dictionary of parameters used for the GP layer.
    strategy: tf distributed training strategy object.
    summary_writer: tf summary writer to log training and evaluation metrics.
    loss_type: str, loss type to use during training. Currently only supports
      focal loss and cross-entropy loss.
    use_spec_norm: Whether to use spectral normalization for the dense layer.
    spec_norm_multiplier: Multiplier used to control the magnitude of the
      eigenvalues of the dense layer weight matrix.
    use_spec_norm_mp: Whether to use spectral normalization for the MP layer.
    spec_norm_multiplier_mp: Multiplier used to control the magnitude of the
      eigenvalues of the MP layer weight matrix.
  """
  with strategy.scope():
    model = ub.models.mpnn(
        nodes_shape=train_dataset.element_spec[0]['atoms'].shape[1:],
        edges_shape=train_dataset.element_spec[0]['pairs'].shape[1:],
        num_heads=params.num_heads,
        num_layers=params.num_layers,
        message_layer_size=params.message_layer_size,
        readout_layer_size=params.readout_layer_size,
        use_gp_layer=params.use_gp_layer,
        gp_layer_kwargs=gp_layer_kwargs,
        use_spec_norm=use_spec_norm,
        spec_norm_multiplier=spec_norm_multiplier,
        use_spec_norm_mp=use_spec_norm_mp,
        spec_norm_multiplier_mp=spec_norm_multiplier_mp)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=params.learning_rate)

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.CategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/roc_auc': tf.keras.metrics.AUC(),
    }
    for dataset_name in eval_datasets:
      metrics[f'{dataset_name}/accuracy'] = (
          tf.keras.metrics.CategoricalAccuracy())
      metrics[f'{dataset_name}/roc_auc'] = tf.keras.metrics.AUC()
      metrics[f'{dataset_name}/negative_log_likelihood'] = (
          tf.keras.metrics.Mean())
      if dataset_name == 'test2':
        ece_num_bins = 5
      else:
        ece_num_bins = 10
      metrics[f'{dataset_name}/ece'] = rm.metrics.ExpectedCalibrationError(
          num_bins=ece_num_bins)
      metrics[f'{dataset_name}/brier'] = rm.metrics.Brier()

  def per_replica_train_step_fn(inputs):
    """Per-Replica StepFn."""
    if len(inputs) == 3:
      features, labels, sample_weights = inputs
    else:
      features, labels = inputs
      sample_weights = 1

    with tf.GradientTape() as tape:
      probs = model(features, training=True)
      negative_log_likelihood = tf.reduce_mean(
          tf.keras.losses.categorical_crossentropy(labels, probs) *
          sample_weights)
      l2_loss = sum(model.losses)
      if loss_type == 'focal':
        focal_loss_fn = tfa_losses.SigmoidFocalCrossEntropy()
        focal_loss = tf.reduce_mean(
            focal_loss_fn(labels, probs) * sample_weights)
        loss = focal_loss + l2_loss
      else:
        loss = negative_log_likelihood + l2_loss
      # Scale the loss, since tf.distribute.Strategy will reduce-sum the
      # gradients across replicas. See details in
      # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
      scaled_loss = loss / strategy.num_replicas_in_sync

    grads = tape.gradient(scaled_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, probs)
    metrics['train/roc_auc'].update_state(labels[:, 1], probs[:, 1])

  def per_replica_eval_step_fn(inputs, dataset_name):
    """Per-Replica StepFn."""
    if len(inputs) == 3:
      features, labels, _ = inputs
    else:
      features, labels = inputs

    probs = model(features, training=False)
    negative_log_likelihood = tf.reduce_mean(
        tf.keras.losses.categorical_crossentropy(labels, probs))
    metrics[f'{dataset_name}/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics[f'{dataset_name}/accuracy'].update_state(labels, probs)
    metrics[f'{dataset_name}/roc_auc'].update_state(labels[:, 1], probs[:, 1])
    metrics[f'{dataset_name}/ece'].add_batch(probs[:, 1], label=labels[:, 1])
    metrics[f'{dataset_name}/brier'].add_batch(probs, label=labels[:, 1])

  @tf.function
  def distributed_train_step(iterator):
    """Training StepFn."""
    for _ in tf.range(tf.cast(params.steps_per_epoch, tf.int32)):
      strategy.run(per_replica_train_step_fn, args=(next(iterator),))

  @tf.function
  def distributed_eval_step(iterator, dataset_name, num_steps):
    """Evaluation StepFn."""
    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      strategy.run(per_replica_eval_step_fn, args=(next(iterator),
                                                   dataset_name))

  # Make the datasets into distributed versions.
  train_dataset = strategy.experimental_distribute_dataset(train_dataset)
  eval_datasets = {
      ds_name: strategy.experimental_distribute_dataset(ds)
      for ds_name, ds in eval_datasets.items()
  }
  logging.info('Number of replicas in sync: %s', strategy.num_replicas_in_sync)

  train_iterator = iter(train_dataset)
  start_time = time.time()
  metrics_history = collections.defaultdict(list)
  for epoch in range(params.num_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    distributed_train_step(train_iterator)

    current_step = (epoch + 1) * params.steps_per_epoch
    max_steps = params.steps_per_epoch * params.num_epochs
    time_elapsed = time.time() - start_time
    steps_per_sec = float(current_step) / time_elapsed
    eta_seconds = (max_steps - current_step) / steps_per_sec
    message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
               'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                   current_step / max_steps, epoch + 1, params.num_epochs,
                   steps_per_sec, eta_seconds / 60, time_elapsed / 60))
    logging.info(message)

    # Start evaluation.
    logging.info('Starting to run eval at epoch: %s', epoch)
    for dataset_name, eval_dataset in eval_datasets.items():
      eval_iterator = iter(eval_dataset)
      distributed_eval_step(eval_iterator, dataset_name,
                            steps_per_eval[dataset_name])

    metrics_history['epoch'].append(epoch + 1)
    with summary_writer.as_default():
      for name, metric in metrics.items():
        result = utils.get_metric_result_value(metric)
        tf.summary.scalar(name, result, step=epoch + 1)
        metrics_history[name].append(str(result))

    for metric in metrics.values():
      metric.reset_states()

    model.save(os.path.join(model_dir, f'model_{epoch + 1}'), overwrite=True)

  utils.write_params(metrics_history,
                     os.path.join(model_dir, 'metrics_history.json'))
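
# Both run() variants share the same dataset-distribution boilerplate; a
# compact, reusable sketch of that pattern (hypothetical helper, not from the
# original source):
from typing import Dict

import tensorflow as tf

def distribute_all(strategy: tf.distribute.Strategy,
                   train_dataset: tf.data.Dataset,
                   eval_datasets: Dict[str, tf.data.Dataset]):
  """Distributes the training dataset and every evaluation dataset."""
  train_dist = strategy.experimental_distribute_dataset(train_dataset)
  eval_dist = {
      name: strategy.experimental_distribute_dataset(ds)
      for name, ds in eval_datasets.items()
  }
  return train_dist, eval_dist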