def config_gpus():
    """
    Initialize GPUs for distributed training.

    Multiple GPUs are used across the workers (one process per GPU): enable
    memory growth on every visible GPU, then pin this process to the single
    GPU matching its local rank.

    :return: None
    """
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')
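# A minimal usage sketch, assuming `dist` is smdistributed.dataparallel.tensorflow
# and is initialized before TensorFlow touches any GPU:
#
#   import tensorflow as tf
#   import smdistributed.dataparallel.tensorflow as dist
#
#   dist.init()
#   config_gpus()  # must run before the first GPU op is executed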
def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
    if mode not in ['train', 'validation', 'benchmark', 'inference']:
        raise ValueError(
            "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode)

    # Limit available GPU memory (tune the size)
    if use_dali:
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_memory_fraction)
        config = tf.ConfigProto(gpu_options=gpu_options)
        config.gpu_options.allow_growth = False
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.visible_device_list = str(gpu_id)

    if hvd_utils.is_using_hvd():
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.force_gpu_compatible = True  # Force pinned memory

    # Bug workaround: disable BN+ReLU fusion via the grappler remapper
    from tensorflow.core.protobuf import rewriter_config_pb2
    config.graph_options.rewrite_options.remapping = (
        rewriter_config_pb2.RewriterConfig.OFF)

    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # Avoid a pool of Eigen threads
        if hvd_utils.is_using_hvd():
            config.inter_op_parallelism_threads = max(
                2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        else:
            config.inter_op_parallelism_threads = 4

    return config
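# Usage sketch (assumes TF 1.x / tf.compat.v1 graph-mode semantics and that
# hvd, hvd_utils, and multiprocessing are imported at module scope as above):
#
#   config = _get_session_config(mode='train', use_xla=True, use_dali=False,
#                                gpu_memory_fraction=0.9)
#   with tf.compat.v1.Session(config=config) as sess:
#       sess.run(...)  # run the training graph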
def __init__(self, filenames, idx_filenames, height, width, batch_size,
             num_threads, dtype=tf.uint8, dali_cpu=True,
             deterministic=False, training=False):
    device_id = hvd.local_rank()
    shard_id = hvd.rank()
    num_gpus = hvd.size()
    pipe = HybridPipe(
        tfrec_filenames=filenames,
        tfrec_idx_filenames=idx_filenames,
        height=height,
        width=width,
        batch_size=batch_size,
        num_threads=num_threads,
        device_id=device_id,
        shard_id=shard_id,
        num_gpus=num_gpus,
        deterministic=deterministic,
        dali_cpu=dali_cpu,
        training=training)

    daliop = dali_tf.DALIIterator()

    with tf.device("/gpu:0"):
        self.images, self.labels = daliop(
            pipeline=pipe,
            shapes=[(batch_size, height, width, 3), (batch_size, 1)],
            dtypes=[tf.float32, tf.int64],
            device_id=device_id)
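# Usage sketch, assuming this __init__ belongs to a DALI wrapper class (called
# DALIPreprocessor here purely for illustration) and a TF 1.x session:
#
#   preproc = DALIPreprocessor(train_files, train_idx_files, height=224,
#                              width=224, batch_size=256, num_threads=4,
#                              training=True)
#   with tf.compat.v1.Session() as sess:
#       images, labels = sess.run([preproc.images, preproc.labels])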
def MPI_local_rank():
    # `hr` is assumed to be the distributed backend module (e.g.,
    # smdistributed.dataparallel.tensorflow) imported at module scope.
    return hr.local_rank()
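# A fallback sketch when no distributed backend is importable: read the local
# rank straight from the MPI launcher's environment. The variable below is the
# Open MPI name; other launchers use different names.

import os

def mpi_local_rank_from_env(default=0):
    # OMPI_COMM_WORLD_LOCAL_RANK is set per-process by Open MPI's mpirun.
    return int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", default))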
logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.getLevelName("INFO"),
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

if SDP_ENABLED:
    sdp.init()
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], "GPU")

# Load model and tokenizer
model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

# Get datasets
tf_train_dataset, tf_test_dataset = get_datasets()

# Define optimizer and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Training
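# When SDP_ENABLED, model.fit() alone does not average gradients across
# workers; SMDataParallel's TF2 API expects a custom training step. A minimal
# sketch, assuming `sdp` is smdistributed.dataparallel.tensorflow (the batch
# layout is illustrative):
#
#   @tf.function
#   def training_step(batch, labels, first_batch):
#       with tf.GradientTape() as tape:
#           outputs = model(batch, training=True)
#           loss_value = loss(labels, outputs.logits)
#       tape = sdp.DistributedGradientTape(tape)  # all-reduce the gradients
#       grads = tape.gradient(loss_value, model.trainable_variables)
#       optimizer.apply_gradients(zip(grads, model.trainable_variables))
#       if first_batch:  # sync initial weights across workers
#           sdp.broadcast_variables(model.variables, root_rank=0)
#           sdp.broadcast_variables(optimizer.variables(), root_rank=0)
#       return loss_value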
def main(args):
    # Hyperparameters
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer
    model_type = args.model_type

    # SageMaker options
    training_dir = args.train
    validation_dir = args.validation
    eval_dir = args.eval

    # Change: Initialize SMDataParallel and get the size of the cluster
    smdp.init()
    size = smdp.size()

    # Change: Pin each process to a single GPU (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        # SMDataParallel: pin GPUs to a single SMDataParallel process [use the SMDataParallel local_rank() API]
        tf.config.experimental.set_visible_devices(gpus[smdp.local_rank()], 'GPU')

    # Get datasets
    train_dataset = get_dataset(training_dir + '/train.tfrecords', batch_size)
    train_dataset = train_dataset.take(NUM_TRAIN_IMAGES // size).shuffle(10000)
    val_dataset = get_dataset(validation_dir + '/validation.tfrecords', batch_size)
    eval_dataset = get_dataset(eval_dir + '/eval.tfrecords', batch_size)

    # Load model
    model = get_model(model_type)

    # Optimizer
    if optimizer.lower() == 'adam':
        opt = Adam(lr=lr * size, decay=weight_decay)
    elif optimizer.lower() == 'rmsprop':
        opt = RMSprop(lr=lr * size, decay=weight_decay)
    else:
        opt = SGD(lr=lr * size, decay=weight_decay, momentum=momentum)

    # Loss function
    loss = tf.keras.losses.CategoricalCrossentropy()

    # Metrics to track
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
    val_loss = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy = tf.keras.metrics.CategoricalAccuracy(name='val_accuracy')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

    # Training step
    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            train_pred = model(images, training=True)
            loss_value = loss(labels, train_pred)

        # Change: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape
        tape = smdp.DistributedGradientTape(tape)
        grads = tape.gradient(loss_value, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        if first_batch:
            # Change: Broadcast model and optimizer variables from rank 0
            smdp.broadcast_variables(model.variables, root_rank=0)
            smdp.broadcast_variables(opt.variables(), root_rank=0)

        # Change: all_reduce call to average the loss across workers
        train_loss_value = smdp.oob_allreduce(loss_value)

        train_loss(train_loss_value)
        train_accuracy(labels, train_pred)

    # Test step
    @tf.function
    def test_step(images, labels):
        val_pred = model(images, training=False)
        val_loss_value = loss(labels, val_pred)

        val_loss(val_loss_value)
        val_accuracy(labels, val_pred)

    if smdp.rank() == 0:
        tb_log_dir = '/opt/ml/output/tensorboard/'
        train_summary_writer = tf.summary.create_file_writer(tb_log_dir)
        test_summary_writer = tf.summary.create_file_writer(tb_log_dir)

    # Training loop
    for epoch in range(epochs):
        train_loss.reset_states()
        train_accuracy.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()

        # Time the whole epoch, not just the last batch
        start_time = time.time()
        for batch, (images, labels) in enumerate(train_dataset):
            training_step(images, labels, batch == 0)
        epoch_time = time.time() - start_time

        for images, labels in val_dataset:
            test_step(images, labels)

        if smdp.rank() == 0:
            with train_summary_writer.as_default():
                tf.summary.scalar('train_loss', train_loss.result(), step=epoch)
                tf.summary.scalar('train_accuracy', train_accuracy.result(), step=epoch)

            with test_summary_writer.as_default():
                tf.summary.scalar('val_loss', val_loss.result(), step=epoch)
                tf.summary.scalar('val_accuracy', val_accuracy.result(), step=epoch)

            print(
                f'Epoch: {epoch + 1}, '
                f'Epoch duration: {epoch_time} sec, '
                f'Training loss: {train_loss.result()}, '
                f'Training accuracy: {train_accuracy.result() * 100}, '
                f'Validation loss: {val_loss.result()}, '
                f'Validation accuracy: {val_accuracy.result() * 100}')

    for images, labels in eval_dataset:
        test_pred = model(images, training=False)
        test_loss_value = loss(labels, test_pred)

        test_loss(test_loss_value)
        test_accuracy(labels, test_pred)

    print('====== Test Results ======')
    print(f'Test loss: {test_loss.result()}, '
          f'Test accuracy: {test_accuracy.result() * 100}')
    print('====== End of training ======')

    # Change: Save checkpoints only from the master node
    if smdp.rank() == 0:
        model.save(os.path.join(os.environ["SM_MODEL_DIR"], '1'))
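# Note on sharding: if get_dataset() reads records in a deterministic order,
# `take(NUM_TRAIN_IMAGES // size)` gives every worker the same leading slice
# of the data. To give each worker a distinct shard, tf.data's shard() is the
# usual pattern (a sketch, not the original code):
#
#   train_dataset = get_dataset(training_dir + '/train.tfrecords', batch_size)
#   train_dataset = train_dataset.shard(num_shards=smdp.size(),
#                                       index=smdp.rank()).shuffle(10000)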
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or
# implied. See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow as tf

tf.random.set_seed(42)

import smdistributed.dataparallel.tensorflow as dist

dist.init()

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')

(mnist_images, mnist_labels), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % dist.rank())

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
     tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, LoggingArguments, PathArguments)
    )
    (
        model_args,
        data_args,
        train_args,
        log_args,
        path_args,
        remaining_strings,
    ) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    # SageMaker may have some extra strings. TODO: Test this on SM.
    assert len(remaining_strings) == 0, f"The args {remaining_strings} could not be parsed."

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    # Settings init
    parse_bool = lambda arg: arg == "true"
    do_gradient_accumulation = train_args.gradient_accumulation_steps > 1
    do_xla = not parse_bool(train_args.skip_xla)
    do_eager = parse_bool(train_args.eager)
    skip_sop = parse_bool(train_args.skip_sop)
    skip_mlm = parse_bool(train_args.skip_mlm)
    pre_layer_norm = parse_bool(model_args.pre_layer_norm)
    fast_squad = parse_bool(log_args.fast_squad)
    dummy_eval = parse_bool(log_args.dummy_eval)
    is_sagemaker = path_args.filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker

    global max_grad_norm
    max_grad_norm = train_args.max_grad_norm

    # TODO: Obfuscate smddpcommon. This code does not use GradientTape, so the
    # bucket size has to be passed directly like this.
    if train_args.bucket_cap_mb:
        bucket_cap_bytes = int(train_args.bucket_cap_mb * 1024 * 1024)
    else:
        bucket_cap_bytes = int(64 * 1024 * 1024)
    hc.setBucketSize(bucket_cap_bytes)

    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[smddp.local_rank()], "GPU")

    # XLA, AutoGraph
    tf.config.optimizer.set_jit(do_xla)
    tf.config.experimental_run_functions_eagerly(do_eager)

    if smddp.rank() == 0:
        # The run name should only be generated on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "sm" if is_sagemaker else "eks"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""

        if log_args.run_name is None:
            metadata = (
                f"{model_args.model_type}"
                f"-{model_args.model_size}"
                f"-{model_args.load_from}"
                f"-{smddp.size()}gpus"
                f"-{train_args.per_gpu_batch_size * smddp.size() * train_args.gradient_accumulation_steps}globalbatch"
                f"-{train_args.learning_rate}maxlr"
                f"-{train_args.learning_rate_decay_power}power"
                f"-{train_args.optimizer}opt"
                f"-{train_args.total_steps}steps"
                f"-{'preln' if pre_layer_norm else 'postln'}"
                f"{loss_str}"
                f"-{model_args.hidden_dropout_prob}dropout"
            )
            run_name = f"{current_time}-{platform}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"
        else:
            run_name = log_args.run_name

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        if not os.path.exists(path_args.log_dir):
            os.makedirs(path_args.log_dir)
        handlers = [
            logging.FileHandler(
                os.path.join(path_args.filesystem_prefix, path_args.log_dir, f"{run_name}.log")
            ),
            TqdmLoggingHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

        # Check that arguments were passed in properly, only after registering the alert_func and logging
        assert not (skip_sop and skip_mlm), "Cannot use both --skip_sop and --skip_mlm"

    wrap_global_functions(do_gradient_accumulation)

    # Create optimizer and enable AMP loss scaling.
    if train_args.optimizer == "lamb":
        optimizer = get_lamb_optimizer(train_args)
    elif train_args.optimizer == "adamw":
        optimizer = get_adamw_optimizer(train_args)
    else:
        raise ValueError(f"Unknown optimizer: {train_args.optimizer} (allowed: 'lamb', 'adamw')")

    if _PRE_TF_2_4_0:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale="dynamic"
        )
    else:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    gradient_accumulator = GradientAccumulator()

    loaded_optimizer_weights = None

    model = create_model(model_class=TFAutoModelForPreTraining, model_args=model_args)
    tokenizer = create_tokenizer(model_args.model_type)
    if model_args.load_from == "checkpoint":
        checkpoint_path = os.path.join(path_args.filesystem_prefix, model_args.checkpoint_path)
        model_ckpt, optimizer_ckpt = get_checkpoint_paths_from_prefix(checkpoint_path)
        if smddp.rank() == 0:
            model.load_weights(model_ckpt)
            if model_args.load_optimizer_state == "true":
                loaded_optimizer_weights = np.load(optimizer_ckpt, allow_pickle=True)
                # We do not set the weights yet; we have to do a first step to initialize the optimizer.

    # Train filenames are [1, 2047], val filenames are [0]. Note the different subdirectories.
    # TODO: Move to the same folder structure and remove the if/else.
    train_glob = os.path.join(path_args.filesystem_prefix, path_args.train_dir, "*.tfrecord")
    validation_glob = os.path.join(path_args.filesystem_prefix, path_args.val_dir, "*.tfrecord")

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_dataset_from_tfrecords(
        model_type=model_args.model_type,
        filenames=train_filenames,
        max_seq_length=data_args.max_seq_length,
        max_predictions_per_seq=data_args.max_predictions_per_seq,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
    )  # Of shape [per_gpu_batch_size, ...]
    # Batch of batches, helpful for gradient accumulation. Shape [grad_steps, per_gpu_batch_size, ...]
    train_dataset = train_dataset.batch(train_args.gradient_accumulation_steps)

    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow allreduce on a subset of ranks
    if smddp.rank() == 0:
        validation_dataset = get_dataset_from_tfrecords(
            model_type=model_args.model_type,
            filenames=validation_filenames,
            max_seq_length=data_args.max_seq_length,
            max_predictions_per_seq=data_args.max_predictions_per_seq,
            per_gpu_batch_size=train_args.per_gpu_batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

        pbar = tqdm.tqdm(total=train_args.total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        logger.info(f"Starting training, job name {run_name}")

    i = 1
    start_time = time.perf_counter()
    train_start_time = time.perf_counter()
    for batch in train_dataset:
        learning_rate = optimizer.learning_rate(step=tf.constant(i, dtype=tf.float32))
        # weight_decay = wd_schedule(step=tf.constant(i, dtype=tf.float32))
        loss_scale = optimizer.loss_scale() if _PRE_TF_2_4_0 else optimizer.loss_scale
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm = train_step(
            model=model,
            optimizer=optimizer,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=train_args.gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't want to wrap broadcast_variables() in a tf.function, which can lead to asynchronous errors
        if i == 1:
            if smddp.rank() == 0 and loaded_optimizer_weights is not None:
                optimizer.set_weights(loaded_optimizer_weights)
            print(" RANK {} is broadcasting".format(smddp.rank()))
            smddp.broadcast_variables(model.variables, root_rank=0)
            smddp.broadcast_variables(optimizer.variables(), root_rank=0)
            print(" RANK {} is done broadcasting".format(smddp.rank()))
            # Restore the step count from the optimizer's iteration counter
            i = optimizer.get_weights()[0]

        is_final_step = i >= train_args.total_steps
        do_squad = (log_args.squad_frequency != 0) and (
            (i % log_args.squad_frequency == 0) or is_final_step
        )
        # SQuAD requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            from albert.run_squad import get_squad_results_while_pretraining

            squad_results = get_squad_results_while_pretraining(
                model=model,
                tokenizer=tokenizer,
                model_size=model_args.model_size,
                filesystem_prefix=path_args.filesystem_prefix,
                step=i,
                dataset=data_args.squad_version,
                fast=log_args.fast_squad,
                dummy_eval=log_args.dummy_eval,
            )
            if smddp.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results["f1"]
                logger.info(f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}")
            # Re-wrap AutoGraph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)
            gc.collect()

        if smddp.rank() == 0:
            do_log = i % log_args.log_frequency == 0
            do_checkpoint = (log_args.checkpoint_frequency != 0) and (
                (i % log_args.checkpoint_frequency == 0) or is_final_step
            )
            do_validation = (log_args.validation_frequency != 0) and (
                (i % log_args.validation_frequency == 0) or is_final_step
            )

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                elapsed_time = time.perf_counter() - start_time
                if i == 1:
                    logger.info(f"First step: {elapsed_time:.3f} secs")
                elif is_final_step:
                    total_time = time.perf_counter() - train_start_time
                    seq_per_sec = (
                        i
                        * train_args.per_gpu_batch_size
                        * smddp.size()
                        * train_args.gradient_accumulation_steps
                        / total_time
                    )
                    logger.info(
                        f"Final step {i}: {description} -- Average seq_per_sec: {seq_per_sec:.2f} -- Total Time: {total_time}"
                    )
                else:
                    it_per_sec = log_args.log_frequency / elapsed_time
                    logger.info(f"Train step {i} -- {description} -- It/s: {it_per_sec:.2f}")
                    start_time = time.perf_counter()

            if do_checkpoint:
                checkpoint_prefix = os.path.join(
                    path_args.filesystem_prefix, path_args.checkpoint_dir, f"{run_name}-step{i}"
                )
                model_ckpt = f"{checkpoint_prefix}.ckpt"
                optimizer_ckpt = f"{checkpoint_prefix}-optimizer.npy"
                logger.info(f"Saving model at {model_ckpt}, optimizer at {optimizer_ckpt}")
                model.save_weights(model_ckpt)
                # model.load_weights(model_ckpt)

                optimizer_weights = optimizer.get_weights()
                np.save(optimizer_ckpt, optimizer_weights)
                # optimizer.set_weights(optimizer_weights)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(path_args.filesystem_prefix, path_args.log_dir, run_name)
                )
                config = {
                    **asdict(model_args),
                    **asdict(data_args),
                    **asdict(train_args),
                    **asdict(log_args),
                    "global_batch_size": train_args.per_gpu_batch_size * smddp.size(),
                }
                if is_wandb_available():
                    wandb.init(config=config, project=model_args.model_type)
                    wandb.run.save()
                    wandb_run_name = wandb.run.name

            train_metrics = {
                "weight_norm": weight_norm,
                "grad_norm": grad_norm,
                "loss_scale": loss_scale,
                "learning_rate": learning_rate,
                "train/loss": loss,
                "train/mlm_loss": mlm_loss,
                "train/mlm_acc": mlm_acc,
                "train/sop_loss": sop_loss,
                "train/sop_acc": sop_acc,
            }
            all_metrics = {**train_metrics}
            if do_validation:
                val_metrics = {
                    "val/loss": val_loss,
                    "val/mlm_loss": val_mlm_loss,
                    "val/mlm_acc": val_mlm_acc,
                    "val/sop_loss": val_sop_loss,
                    "val/sop_acc": val_sop_acc,
                }
                all_metrics = {**all_metrics, **val_metrics}
            if do_squad:
                squad_metrics = {
                    "squad/f1": squad_f1,
                    "squad/exact": squad_exact,
                }
                all_metrics = {**all_metrics, **squad_metrics}

            # Log to TensorBoard
            with summary_writer.as_default():
                for name, val in all_metrics.items():
                    tf.summary.scalar(name, val, step=i)
            # Log to Weights & Biases
            if is_wandb_available():
                wandb.log({"step": i, **all_metrics})

        i += 1
        if is_final_step:
            break

    if smddp.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
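# For reference, the gradient-accumulation pattern driven by the "batch of
# batches" above boils down to summing per-micro-batch gradients and applying
# them once. A minimal self-contained sketch (not the GradientAccumulator
# class used by this script):

import tensorflow as tf

def accumulated_step(model, optimizer, loss_fn, micro_batches):
    """Apply one optimizer update from a list of (features, labels) micro-batches."""
    accum = [tf.zeros_like(v) for v in model.trainable_variables]
    for features, labels in micro_batches:
        with tf.GradientTape() as tape:
            loss = loss_fn(labels, model(features, training=True))
        grads = tape.gradient(loss, model.trainable_variables)
        accum = [a + g for a, g in zip(accum, grads)]
    # Average so the update matches one large batch of the same total size.
    n = tf.cast(len(micro_batches), tf.float32)
    optimizer.apply_gradients(zip([a / n for a in accum],
                                  model.trainable_variables))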
def main(_):
    # Lazy compilation causes memory fragmentation for BERT, leading to OOM
    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # Set seed to reduce randomness
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    if FLAGS.herring:
        import smdistributed.dataparallel.tensorflow as hvd
        hvd.init()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.io.gfile.makedirs(FLAGS.output_dir)

    input_files = []
    for input_file_dir in FLAGS.input_files_dir.split(","):
        input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*")))

    if FLAGS.herring and len(input_files) < hvd.size():
        raise ValueError("Input files must be sharded")
    if FLAGS.amp and FLAGS.manual_fp16:
        raise ValueError("AMP and manual mixed-precision training are both activated!")

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    config = tf.compat.v1.ConfigProto()
    if FLAGS.herring:
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.rank() == 0:
            tf.compat.v1.logging.info("***** Configuration *****")
            for key in FLAGS.__flags.keys():
                tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
            tf.compat.v1.logging.info("**************************")
    # config.gpu_options.per_process_gpu_memory_fraction = 0.7
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
        if FLAGS.amp:
            tf.enable_resource_variables()

    run_config = tf.estimator.RunConfig(
        tf_random_seed=(FLAGS.seed if not FLAGS.herring else (FLAGS.seed + hvd.rank())),
        model_dir=FLAGS.output_dir,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.herring or hvd.rank() == 0 else None,
        save_summary_steps=FLAGS.save_checkpoints_steps if not FLAGS.herring or hvd.rank() == 0 else None,
        # This variable controls how often the estimator reports examples/sec;
        # the default is every 100 steps. When --report_loss is True, we set it
        # to a very large value to suppress the estimator's default reporting.
        # Ideally we would set it to None, but that does not work.
        log_step_count_steps=10000 if FLAGS.report_loss else 100)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate if not FLAGS.herring else FLAGS.learning_rate * hvd.size(),
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_one_hot_embeddings=False,
        hvd=None if not FLAGS.herring else hvd)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config)

    if FLAGS.do_train:
        training_hooks = []
        if FLAGS.herring and hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
        if not FLAGS.herring or hvd.rank() == 0:
            global_batch_size = (FLAGS.train_batch_size * FLAGS.num_accumulation_steps
                                 if not FLAGS.herring
                                 else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size())
            training_hooks.append(
                _LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, dllogging,
                                   FLAGS.display_loss_steps, FLAGS.save_checkpoints_steps,
                                   FLAGS.report_loss))

        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)

        train_input_fn = input_fn_builder(
            input_files=input_files,
            batch_size=FLAGS.train_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=None if not FLAGS.herring else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn, hooks=training_hooks,
                        max_steps=FLAGS.num_train_steps)
        train_time_elapsed = time.time() - train_start_time

        if not FLAGS.herring or hvd.rank() == 0:
            train_time_wo_overhead = training_hooks[-1].total_time
            avg_sentences_per_second = FLAGS.num_train_steps * global_batch_size * 1.0 / train_time_elapsed
            ss_sentences_per_second = (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead

            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info("Total Training Time = %0.2f for Sentences = %d",
                                      train_time_elapsed,
                                      FLAGS.num_train_steps * global_batch_size)
            tf.compat.v1.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                                      train_time_wo_overhead,
                                      (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size)
            tf.compat.v1.logging.info("Training Throughput Average (sentences/sec) with overhead = %0.2f",
                                      avg_sentences_per_second)
            tf.compat.v1.logging.info("Training Throughput Average (sentences/sec) = %0.2f",
                                      ss_sentences_per_second)
            dllogging.logger.log(step=(), data={"throughput_train": ss_sentences_per_second},
                                 verbosity=Verbosity.DEFAULT)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and (not FLAGS.herring or hvd.rank() == 0):
        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_files = []
        for eval_file_dir in FLAGS.eval_files_dir.split(","):
            eval_files.extend(tf.io.gfile.glob(os.path.join(eval_file_dir, "*")))

        eval_input_fn = input_fn_builder(
            input_files=eval_files,
            batch_size=FLAGS.eval_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=None if not FLAGS.herring else hvd)

        eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
        eval_start_time = time.time()
        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)
        eval_time_elapsed = time.time() - eval_start_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        num_sentences = (int(len(time_list) * 0.99)) * FLAGS.eval_batch_size
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info("Total Inference Time = %0.2f for Sentences = %d",
                                  eval_time_elapsed,
                                  eval_hooks[-1].count * FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
                                  eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info("Inference Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(), data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
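# Worked example of the trimming above: with 1000 recorded batch times and
# eval_batch_size=8, the sorted list keeps its fastest int(1000 * 0.99) = 990
# entries (dropping the slowest 1%, typically init/warmup batches), so
# num_sentences = 990 * 8 = 7920 and throughput = 7920 / eval_time_wo_overhead.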
def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
    else:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "0"

    # Set seed to reduce randomness
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    hvd.init()

    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')

    session_config = tf.ConfigProto()
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    if FLAGS.allow_xla:
        session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
    config = tf.estimator.RunConfig(tf_random_seed=(FLAGS.seed + hvd.rank()),
                                    model_dir=model_dir,
                                    session_config=session_config)

    train_and_eval_dict = model_lib.create_estimator_and_inputs(
        run_config=config,
        eval_count=FLAGS.eval_count,
        hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
        pipeline_config_path=FLAGS.pipeline_config_path,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples))
    estimator = train_and_eval_dict['estimator']
    train_input_fn = train_and_eval_dict['train_input_fn']
    eval_input_fns = train_and_eval_dict['eval_input_fns']
    eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
    predict_input_fn = train_and_eval_dict['predict_input_fn']
    train_steps = train_and_eval_dict['train_steps']

    if FLAGS.checkpoint_dir:
        if FLAGS.eval_training_data:
            name = 'training_data'
            input_fn = eval_on_train_input_fn
        else:
            name = 'validation_data'
            # The first eval input will be evaluated.
            input_fn = eval_input_fns[0]
        if FLAGS.run_once:
            estimator.evaluate(input_fn,
                               steps=None,
                               checkpoint_path=tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
        else:
            model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn,
                                      train_steps, name)
    else:
        train_spec, eval_specs = model_lib.create_train_and_eval_specs(
            train_input_fn,
            eval_input_fns,
            eval_on_train_input_fn,
            predict_input_fn,
            train_steps,
            eval_on_train_data=False)

        train_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                       DLLoggerHook(hvd.size() * train_and_eval_dict['train_batch_size'],
                                    hvd.rank())]
        eval_hooks = []

        for x in range(FLAGS.eval_count):
            estimator.train(train_input_fn,
                            hooks=train_hooks,
                            steps=train_steps // FLAGS.eval_count)
            if hvd.rank() == 0 and not FLAGS.train_only:
                eval_input_fn = eval_input_fns[0]
                results = estimator.evaluate(eval_input_fn,
                                             steps=None,
                                             hooks=eval_hooks)
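# Launch sketch: this script follows the Horovod one-process-per-GPU pattern,
# so it is typically started with mpirun (the script name and flag values
# below are illustrative, not taken from the source):
#
#   mpirun -np 8 --allow-run-as-root \
#       python model_main.py \
#           --model_dir=/results \
#           --pipeline_config_path=configs/pipeline.config \
#           --amp --allow_xla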
    parser.add_argument('--rank', type=int, default=0)

    # SageMaker container environment
    parser.add_argument('--model_dir', type=str, default='../model')
    parser.add_argument('--data_dir', type=str, default='../data')

    args = parser.parse_args()

    try:
        args.model_dir = os.environ['SM_MODEL_DIR']
        args.data_dir = os.environ['SM_CHANNEL_TRAINING']
    except KeyError:
        print("The model starts training on the local host without a SageMaker TrainingJob.")
        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)

    ########################################################
    ####### 2. SageMaker Distributed Data Parallel   #######
    #######    - Get the GPU count and rank numbers  #######
    ########################################################
    args.size = smdp.size()              # total number of GPUs across all hosts
    args.rank = smdp.rank()              # global rank across all hosts
    args.local_rank = smdp.local_rank()  # rank within this host

    train(args)
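# A common follow-up inside train() is to scale the learning rate by the
# worker count and to let only the global rank 0 write artifacts (a sketch;
# names other than args.size/args.rank/args.model_dir are illustrative):
#
#   effective_lr = base_lr * args.size  # linear LR scaling heuristic
#   if args.rank == 0:
#       model.save(os.path.join(args.model_dir, '1'))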
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or
# implied. See the License for the specific language governing permissions and
# limitations under the License.

# Third Party
import smdistributed.dataparallel.tensorflow as smdataparallel
import tensorflow as tf

# Register smdataparallel shutdown hook
smdataparallel.init()

gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[smdataparallel.local_rank()], "GPU")

(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(
    path="mnist-%d.npz" % smdataparallel.rank()
)

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
     tf.cast(mnist_labels, tf.int64))
)
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential(
    [
        tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
        tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),