def train(model_name, dropout_rate, optim_name, use_lookahead, batch_size,
          iter_size, lr_sched, initial_lr, final_lr, weight_decay, epochs,
          dataset_dir):
    """Prepare data and train the model."""
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train', batch_size)
    ds_valid = get_dataset(dataset_dir, 'validation', batch_size)

    # instantiate training callbacks
    lrate = get_lr_func(epochs, lr_sched, initial_lr, final_lr)
    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='val_loss',
        save_best_only=True)
    tensorboard = tf.keras.callbacks.TensorBoard(
        log_dir='{}/{}'.format(config.LOG_DIR, time.time()))

    # build model and do training
    model = get_training_model(
        model_name=model_name,
        dropout_rate=dropout_rate,
        optimizer=optimizer,
        use_lookahead=use_lookahead,
        iter_size=iter_size,
        weight_decay=weight_decay)
    model.fit(
        x=ds_train,
        steps_per_epoch=1281167 // batch_size,  # ImageNet-1k train set size
        validation_data=ds_valid,
        validation_steps=50000 // batch_size,   # ImageNet-1k validation set size
        callbacks=[lrate, model_ckpt, tensorboard],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs)

    # training finished
    model.save('{}/{}-model-final.h5'.format(config.SAVE_DIR, save_name))
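# The `get_dataset` helper used above is not shown in this file. Below is a
# minimal sketch of what it is assumed to do: build a repeating, batched
# tf.data pipeline over ImageNet-style TFRecords. The feature keys, the
# file-name pattern, and the helper name are assumptions, not taken from
# this repository.
def get_dataset_sketch(dataset_dir, subset, batch_size):
    import glob
    files = glob.glob(os.path.join(dataset_dir, '{}-*'.format(subset)))

    def parse(record):
        # Assumed TFRecord layout: JPEG bytes plus an integer class label.
        features = tf.io.parse_single_example(record, {
            'image/encoded': tf.io.FixedLenFeature([], tf.string),
            'image/class/label': tf.io.FixedLenFeature([], tf.int64),
        })
        image = tf.image.decode_jpeg(features['image/encoded'], channels=3)
        image = tf.image.resize(image, [224, 224]) / 255.0
        label = tf.one_hot(features['image/class/label'], 1000)
        return image, label

    ds = tf.data.TFRecordDataset(files)
    return (ds.map(parse, num_parallel_calls=tf.data.AUTOTUNE)
              .batch(batch_size)
              .prefetch(tf.data.AUTOTUNE)
              .repeat())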
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr,
          final_lr, weight_decay, epochs, dataset_dir, cross_device_ops,
          num_packs, tf_gpu_thread_mode):
    """Prepare data and train the model."""
    start = time.time()
    if tf_gpu_thread_mode in ["global", "gpu_private", "gpu_shared"]:
        os.environ['TF_GPU_THREAD_MODE'] = tf_gpu_thread_mode
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train', batch_size)       # 300 modification
    ds_valid = get_dataset(dataset_dir, 'validation', batch_size)  # 300 modification
    # ds_train = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_train/", 'train', batch_size)    # 1000 modification
    # ds_valid = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_val/", 'validation', batch_size) # 1000 modification

    if cross_device_ops == "HierarchicalCopyAllReduce":
        mirrored_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=tf.distribute.HierarchicalCopyAllReduce(
                num_packs=num_packs))
    elif cross_device_ops == "NcclAllReduce":
        mirrored_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=tf.distribute.NcclAllReduce(num_packs=num_packs))
    else:
        mirrored_strategy = tf.distribute.MirroredStrategy()

    with mirrored_strategy.scope():
        model = get_training_model(model_name=model_name,
                                   dropout_rate=dropout_rate,
                                   optimizer=optimizer,
                                   label_smoothing=label_smoothing,
                                   use_lookahead=use_lookahead,
                                   iter_size=iter_size,
                                   weight_decay=weight_decay,
                                   gpus=NUM_GPU)

    class PrintAcc(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print(f"Epoch{epoch+1} {logs}")

    NUM_DISTRIBUTE = NUM_GPU if NUM_GPU > 0 else 1
    # train_steps = int(1281167 / batch_size)  # 1000 classes
    # val_steps = int(50000 / batch_size)      # 1000 classes
    # train_steps = int(383690 / batch_size)   # 300 modification
    # val_steps = int(15000 / batch_size)      # 300 modification
    train_steps = int(642289 / batch_size)     # 500 modification
    val_steps = int(25000 / batch_size)        # 500 modification
    print(f"[INFO] Total Epochs:{epochs} Train Steps:{train_steps} "
          f"Validate Steps:{val_steps} Workers:{NUM_DISTRIBUTE} "
          f"Batch size:{batch_size}")

    his = model.fit(
        x=ds_train,
        steps_per_epoch=train_steps,
        validation_data=ds_valid,
        validation_steps=val_steps,
        callbacks=[
            get_lr_func(epochs, lr_sched, initial_lr, final_lr, NUM_GPU)
        ],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs,
        verbose=2)

    end = time.time()
    fit_time = (end - start) / 3600.0  # hours
    top_k_history = his.history.get('val_top_k_categorical_accuracy', [])
    acc = top_k_history[-1] if top_k_history else 0.
    print(f"[TRIAL END] time: {fit_time} {his.history}")
    return acc, fit_time
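# Example invocation of the trial-style `train` above. Every argument value
# here is illustrative (including the model name and dataset path), not taken
# from this repository.
acc, hours = train(model_name='resnet50', dropout_rate=0.2,
                   optim_name='sgd', epsilon=1e-7, label_smoothing=0.1,
                   use_lookahead=False, batch_size=256, iter_size=1,
                   lr_sched='linear', initial_lr=1e-1, final_lr=1e-5,
                   weight_decay=1e-4, epochs=90,
                   dataset_dir='/path/to/tfrecords',
                   cross_device_ops='NcclAllReduce', num_packs=2,
                   tf_gpu_thread_mode='gpu_private')
print(f"val_top_k_acc={acc} fit_hours={hours}")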
def main():
    parser = DenseNetArgumentParser(
        description=(
            "train.py is the main training/evaluation script for DenseNet. "
            "In order to run training on multiple Gaudi cards, use "
            "demo_densenet.py or run train.py with mpirun."))
    args, _ = parser.parse_known_args()

    strategy = None
    verbose = 1
    os.environ['ENABLE_EXPERIMENTAL_FLAGS'] = 'true'
    os.environ['RUN_TPC_FUSER'] = '******'

    if args.deterministic:
        if args.inputs is None:
            raise ValueError("Must provide inputs for deterministic mode")
        if args.resume_from_checkpoint_path is None:
            raise ValueError("Must provide checkpoint for deterministic mode")

    if args.dtype == 'bf16':
        os.environ['TF_BF16_CONVERSION'] = '1'

    if args.run_on_hpu:
        load_habana_module()
        if args.use_hpu_strategy:
            hls_addresses = str(os.environ.get(
                "MULTI_HLS_IPS", "127.0.0.1")).split(",")
            TF_BASE_PORT = 2410
            mpi_rank = comm_rank()
            mpi_size = comm_size()
            if mpi_rank > 0:
                verbose = 0
            # worker_hosts: comma-separated list of worker ip:port pairs.
            # Collect all pairs first and join once, so that pairs from
            # different addresses are comma-separated as well.
            ranks_per_address = mpi_size // len(hls_addresses)
            worker_hosts = ",".join(
                address + ':' + str(TF_BASE_PORT + rank)
                for address in hls_addresses
                for rank in range(ranks_per_address))
            task_index = mpi_rank

            # Configures cluster spec for distribution strategy.
            _ = distribution_utils.configure_cluster(worker_hosts, task_index)
            strategy = HPUStrategy()
            print('Number of devices: {}'.format(
                strategy.num_replicas_in_sync))
    else:
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    if args.seed is not None:
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    img_rows, img_cols = 224, 224  # Resolution of inputs
    channel = 3
    num_classes = 1000
    batch_size = args.batch_size
    nb_epoch = args.epochs
    dataset_dir = args.dataset_dir
    resume_from_checkpoint_path = args.resume_from_checkpoint_path
    resume_from_epoch = args.resume_from_epoch
    dropout_rate = args.dropout_rate
    weight_decay = args.weight_decay
    optim_name = args.optimizer
    initial_lr = args.initial_lr
    model_name = args.model
    save_summary_steps = args.save_summary_steps

    if model_name == "densenet121":
        growth_rate = 32
        nb_filter = 64
        nb_layers = [6, 12, 24, 16]
    elif model_name == "densenet161":
        growth_rate = 48
        nb_filter = 96
        nb_layers = [6, 12, 36, 24]
    elif model_name == "densenet169":
        growth_rate = 32
        nb_filter = 64
        nb_layers = [6, 12, 32, 32]
    else:
        print("model is not supported")
        exit(1)

    # Load our model
    if strategy:
        with strategy.scope():
            model = densenet_model(img_rows=img_rows, img_cols=img_cols,
                                   color_type=channel,
                                   dropout_rate=dropout_rate,
                                   weight_decay=weight_decay,
                                   num_classes=num_classes,
                                   growth_rate=growth_rate,
                                   nb_filter=nb_filter,
                                   nb_layers=nb_layers)
            optimizer = get_optimizer(
                model_name, optim_name, initial_lr, epsilon=1e-2)
            model.compile(optimizer=optimizer,
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
    else:
        model = densenet_model(img_rows=img_rows, img_cols=img_cols,
                               color_type=channel,
                               dropout_rate=dropout_rate,
                               weight_decay=weight_decay,
                               num_classes=num_classes,
                               growth_rate=growth_rate,
                               nb_filter=nb_filter,
                               nb_layers=nb_layers)
        optimizer = get_optimizer(
            model_name, optim_name, initial_lr, epsilon=1e-2)
        model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

    # Start training
    steps_per_epoch = 1281167 // batch_size
    if args.steps_per_epoch is not None:
        steps_per_epoch = args.steps_per_epoch
    validation_steps = 50000 // batch_size
    if args.validation_steps is not None:
        validation_steps = args.validation_steps

    warmup_steps = args.warmup_epochs * steps_per_epoch
    # Epoch -> LR multiplier, converted below to global step -> multiplier.
    lr_sched = {0: 1, 30: 0.1, 60: 0.01, 80: 0.001}
    lr_sched_steps = {
        epoch * steps_per_epoch: multiplier
        for (epoch, multiplier) in lr_sched.items()}

    lrate = StepLearningRateScheduleWithWarmup(
        initial_lr=initial_lr,
        initial_global_step=0,
        warmup_steps=warmup_steps,
        decay_schedule=lr_sched_steps,
        verbose=0)

    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(args.model_dir, config.SAVE_DIR, save_name)
        + '-ckpt-{epoch:03d}.h5',
        monitor='train_loss')
    callbacks = [lrate, model_ckpt]

    if save_summary_steps is not None and save_summary_steps > 0:
        log_dir = os.path.join(args.model_dir, config.LOG_DIR)
        local_batch_size = batch_size

        if args.use_hpu_strategy:
            log_dir = os.path.join(log_dir, 'worker_' + str(comm_rank()))
            local_batch_size = batch_size // strategy.num_replicas_in_sync

        callbacks += [
            TensorBoardWithHParamsV2(
                args.__dict__, log_dir=log_dir,
                update_freq=save_summary_steps,
                profile_batch=0),
            ExamplesPerSecondKerasHookV2(
                save_summary_steps,
                output_dir=log_dir,
                batch_size=local_batch_size),
        ]

    if args.evaluate_checkpoint_path is not None:
        # Build the validation set here; it is otherwise only created in the
        # non-deterministic training branch below.
        ds_valid = get_dataset(dataset_dir, args.val_subset, batch_size)
        model.load_weights(args.evaluate_checkpoint_path)
        results = model.evaluate(x=ds_valid, steps=validation_steps)
        print("Test loss, Test acc:", results)
        exit()

    if (resume_from_epoch is not None) and (resume_from_checkpoint_path is not None):
        model.load_weights(resume_from_checkpoint_path)

    if args.deterministic:
        set_deterministic()
        if not os.path.isfile(args.dump_config):
            raise FileNotFoundError("wrong dump config path")

        import pickle
        x_path = os.path.join(args.inputs, "input")
        y_path = os.path.join(args.inputs, "target")
        x = pickle.load(open(x_path, 'rb'))
        y = pickle.load(open(y_path, 'rb'))

        with dump_callback(args.dump_config):
            model.fit(x=x, y=y,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=callbacks,
                      initial_epoch=resume_from_epoch,
                      epochs=nb_epoch,
                      shuffle=False,
                      verbose=verbose,
                      validation_data=None,
                      validation_steps=0)
    else:
        ds_train = get_dataset(dataset_dir, args.train_subset, batch_size)
        ds_valid = get_dataset(dataset_dir, args.val_subset, batch_size)

        model.fit(x=ds_train, y=None,
                  steps_per_epoch=steps_per_epoch,
                  callbacks=callbacks,
                  initial_epoch=resume_from_epoch,
                  epochs=nb_epoch,
                  shuffle=True,
                  verbose=verbose,
                  validation_data=(ds_valid, None),
                  validation_steps=validation_steps,
                  validation_freq=1)
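# A minimal sketch of the behavior `StepLearningRateScheduleWithWarmup` above
# is assumed to have: linear warmup to `initial_lr` over `warmup_steps`, then
# step-wise decay according to the {global_step: multiplier} schedule built
# from `lr_sched_steps`. Illustrative only, not the imported class.
class StepLRWithWarmupSketch(tf.keras.callbacks.Callback):
    def __init__(self, initial_lr, warmup_steps, decay_schedule):
        super().__init__()
        self.initial_lr = initial_lr
        self.warmup_steps = warmup_steps
        self.decay_schedule = decay_schedule  # e.g. {0: 1, 30 * spe: 0.1, ...}
        self.global_step = 0

    def on_train_batch_begin(self, batch, logs=None):
        if self.global_step < self.warmup_steps:
            # Ramp linearly from ~0 to initial_lr during warmup.
            lr = self.initial_lr * (self.global_step + 1) / self.warmup_steps
        else:
            # Apply the multiplier of the latest schedule boundary passed.
            multiplier = 1.0
            for step, m in sorted(self.decay_schedule.items()):
                if self.global_step >= step:
                    multiplier = m
            lr = self.initial_lr * multiplier
        tf.keras.backend.set_value(self.model.optimizer.lr, lr)
        self.global_step += 1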
def main():
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('--dataset', '--dataset_dir', metavar='PATH',
                        default=config.DEFAULT_DATASET_DIR,
                        help='Dataset directory.')
    parser.add_argument('--optimizer', default='sgd',
                        choices=['sgd', 'adam', 'rmsprop'],
                        help='Optimizer.')
    parser.add_argument('-d', '--dtype', default='fp32',
                        choices=['fp32', 'bf16'],
                        help='Data type.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Global batch size.')
    parser.add_argument('--lr_sched', default='WarmupCosine',
                        choices=['linear', 'exp', 'steps', 'constant',
                                 'WarmupCosine'],
                        help='Learning rate scheduler.')
    parser.add_argument('--initial_lr', type=float, default=6e-2,
                        help='Initial learning rate.')
    parser.add_argument('--final_lr', type=float, default=1e-5,
                        help='Final learning rate.')
    parser.add_argument('--warmup_steps', type=int, default=4000,
                        help='Warmup steps.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Total number of epochs for training.')
    parser.add_argument('--steps_per_epoch', type=int,
                        help='Number of steps for training per epoch, '
                             'overrides default value.')
    parser.add_argument('--validation_steps', type=int,
                        help='Number of steps for validation, overrides '
                             'default value.')
    parser.add_argument('--model', default='ViT-B_16',
                        choices=['ViT-B_16', 'ViT-L_16', 'ViT-B_32',
                                 'ViT-L_32'],
                        help='Model.')
    parser.add_argument('--train_subset', default='train',
                        help='Pattern to detect train subset in dataset '
                             'directory.')
    parser.add_argument('--val_subset', default='validation',
                        help='Pattern to detect validation subset in dataset '
                             'directory.')
    parser.add_argument('--grad_accum_steps', type=int, default=8,
                        help='Gradient accumulation steps.')
    parser.add_argument('--resume_from_checkpoint_path', metavar='PATH',
                        help='Path to checkpoint to start from.')
    parser.add_argument('--resume_from_epoch', metavar='EPOCH_INDEX',
                        type=int, default=0,
                        help='Initial epoch index.')
    parser.add_argument('--evaluate_checkpoint_path', metavar='PATH',
                        help='Checkpoint path for evaluating the model on '
                             '--val_subset')
    parser.add_argument('--weights_path', metavar='PATH',
                        help='Path to weights cache directory. ~/.keras is '
                             'used if not set.')
    parser.add_argument('--deterministic', action='store_true', default=False,
                        help='Enable deterministic behavior; this will also '
                             'disable data augmentation. --seed must be set.')
    parser.add_argument('--seed', type=int,
                        help='Seed to be used by random functions.')
    parser.add_argument('--device', default='HPU',
                        choices=['CPU', 'HPU'],
                        help='Device type.')
    parser.add_argument('--distributed', action='store_true', default=False,
                        help='Enable distributed training.')
    parser.add_argument('--base_tf_server_port', type=int, default=7850,
                        help='Rank 0 port used by tf.distribute.')
    parser.add_argument('--save_summary_steps', type=int, default=0,
                        help='Steps between saving summaries to TensorBoard.')
    parser.add_argument('--recipe_cache', default='/tmp/vit_recipe_cache',
                        help='Path to recipe cache directory. Set to empty '
                             'to disable recipe cache. Externally set '
                             '\'TF_RECIPE_CACHE_PATH\' will override this '
                             'setting.')
    parser.add_argument('--dump_config',
                        help='Side-by-side config file. Internal, do not use.')
    args = parser.parse_args()

    if args.weights_path is not None:
        config.WEIGHTS_DIR = args.weights_path

    if args.dtype == 'bf16':
        tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

    if args.device == 'HPU':
        if args.distributed:
            os.environ['TF_HCCL_MEMORY_ALLOWANCE_MB'] = '500'
        from habana_frameworks.tensorflow import load_habana_module
        from habana_frameworks.tensorflow.ops.layer_norm import HabanaLayerNormalization
        load_habana_module()
        tf.keras.layers.LayerNormalization = HabanaLayerNormalization

        # Handle recipe caching.
        recipe_cache = args.recipe_cache
        if 'TF_RECIPE_CACHE_PATH' not in os.environ.keys() and recipe_cache:
            os.environ['TF_RECIPE_CACHE_PATH'] = recipe_cache

        # Clear previous recipe cache.
        if not args.distributed or comm_rank() == 0:
            if os.path.exists(recipe_cache) and os.path.isdir(recipe_cache):
                import shutil
                shutil.rmtree(recipe_cache)
        # Wait for rank 0 to remove cache.
        if args.distributed:
            from mpi4py import MPI
            MPI.COMM_WORLD.Barrier()

    # Handle determinism.
    config.DETERMINISTIC = args.deterministic
    config.SEED = args.seed
    if args.deterministic:
        assert args.seed is not None, \
            "Deterministic behavior requires seed to be set."
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        config.DATA_AUGMENTATION = False
    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    # Handle distribution strategy.
    if args.distributed:
        tf_distribute_config(args.base_tf_server_port)
        if args.device == 'HPU':
            os.environ['HBN_TF_REGISTER_DATASETOPS'] = '1'
            from habana_frameworks.tensorflow.distribute import HPUStrategy
            strategy = HPUStrategy()
        else:
            strategy = tf.distribute.MultiWorkerMirroredStrategy()
    else:
        strategy = tf.distribute.OneDeviceStrategy(f'device:{args.device}:0')

    if not args.distributed or comm_rank() == 0:
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    num_classes = 1000
    batch_size = args.batch_size
    nb_epoch = args.epochs
    dataset = args.dataset
    resume_from_checkpoint_path = args.resume_from_checkpoint_path
    resume_from_epoch = args.resume_from_epoch
    optim_name = args.optimizer
    initial_lr = args.initial_lr
    final_lr = args.final_lr
    lr_sched = args.lr_sched
    warmup_steps = args.warmup_steps
    model_name = args.model
    grad_accum_steps = args.grad_accum_steps

    ds_train = get_dataset(dataset, args.train_subset, batch_size,
                           is_training=True, distributed=args.distributed)
    ds_valid = get_dataset(dataset, args.val_subset, batch_size,
                           False, distributed=args.distributed)

    if args.dump_config is not None:
        vit.CONFIG_B['dropout'] = 0.0
        vit.CONFIG_L['dropout'] = 0.0

    # Load our model
    with strategy.scope():
        image_size = 384
        if model_name == 'ViT-B_16':
            model = vit.vit_b16(
                image_size=image_size,
                activation='softmax',
                pretrained=True,
                include_top=True,
                pretrained_top=False,
                classes=num_classes,
                weights="imagenet21k")
        elif model_name == 'ViT-L_16':
            model = vit.vit_l16(
                image_size=image_size,
                activation='softmax',
                pretrained=True,
                include_top=True,
                pretrained_top=False,
                classes=num_classes,
                weights="imagenet21k")
        elif model_name == 'ViT-B_32':
            model = vit.vit_b32(
                image_size=image_size,
                activation='softmax',
                pretrained=True,
                include_top=True,
                pretrained_top=False,
                classes=num_classes,
                weights="imagenet21k")
        elif model_name == 'ViT-L_32':
            model = vit.vit_l32(
                image_size=image_size,
                activation='softmax',
                pretrained=True,
                include_top=True,
                pretrained_top=False,
                classes=num_classes,
                weights="imagenet21k")
        else:
            print("Model is not supported, please use one of ViT-B_16, "
                  "ViT-L_16, ViT-B_32 or ViT-L_32")
            exit(0)

        optimizer = get_optimizer(optim_name, initial_lr,
                                  accumulation_steps=grad_accum_steps,
                                  epsilon=1e-2)
        model.compile(optimizer=optimizer,
                      loss='categorical_crossentropy',
                      metrics=['accuracy'],
                      run_eagerly=False)

    # Start training
    steps_per_epoch = 1281167 // batch_size
    if args.steps_per_epoch is not None:
        steps_per_epoch = args.steps_per_epoch
    validation_steps = 50000 // batch_size
    if args.validation_steps is not None:
        validation_steps = args.validation_steps

    total_steps = nb_epoch * steps_per_epoch
    resume_step = resume_from_epoch * steps_per_epoch

    lrate = get_lr_func(nb_epoch, lr_sched, initial_lr, final_lr,
                        warmup_steps, resume_step, total_steps)

    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='train_loss')
    callbacks = [lrate, model_ckpt]

    if args.save_summary_steps > 0:
        callbacks += [TensorBoardWithHParamsV2(
            vars(args), log_dir=config.LOG_DIR,
            update_freq=args.save_summary_steps)]
        callbacks += [ExamplesPerSecondKerasHookV2(
            output_dir=config.LOG_DIR,
            every_n_steps=args.save_summary_steps,
            batch_size=args.batch_size)]

    if args.evaluate_checkpoint_path is not None:
        model.load_weights(args.evaluate_checkpoint_path)
        results = model.evaluate(x=ds_valid, steps=validation_steps)
        print("Test loss, Test acc:", results)
        exit()

    if (resume_from_epoch is not None) and (resume_from_checkpoint_path is not None):
        model.load_weights(resume_from_checkpoint_path)

    with dump_callback(args.dump_config):
        model.fit(x=ds_train,
                  y=None,
                  steps_per_epoch=steps_per_epoch,
                  callbacks=callbacks,
                  initial_epoch=resume_from_epoch,
                  epochs=nb_epoch,
                  shuffle=not args.deterministic,
                  verbose=1 if not args.distributed else comm_rank() == 0,
                  validation_data=(ds_valid, None),
                  validation_steps=validation_steps)

    if not args.distributed or comm_rank() == 0:
        model.save(f'{config.SAVE_DIR}/{save_name}-model-final.h5')
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr,
          final_lr, weight_decay, epochs, dataset_dir):
    """Prepare data and train the model."""
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train', batch_size)       # 300 modification
    ds_valid = get_dataset(dataset_dir, 'validation', batch_size)  # 300 modification
    # ds_train = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_train/", 'train', batch_size)    # 1000 modification
    # ds_valid = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_val/", 'validation', batch_size) # 1000 modification

    mirrored_strategy = tf.distribute.MirroredStrategy(
        cross_device_ops=tf.distribute.NcclAllReduce(num_packs=2))
    # mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        model = get_training_model(model_name=model_name,
                                   dropout_rate=dropout_rate,
                                   optimizer=optimizer,
                                   label_smoothing=label_smoothing,
                                   use_lookahead=use_lookahead,
                                   iter_size=iter_size,
                                   weight_decay=weight_decay,
                                   gpus=NUM_GPU)
        # model = tf.keras.models.load_model("./saves/keras_save")

    class PrintAcc(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print(f"Epoch{epoch+1} acc#{logs.get('acc')}# "
                  f"val_acc#{logs.get('val_acc')} "
                  f"val_top_k_categorical_accuracy#"
                  f"{logs.get('val_top_k_categorical_accuracy')}")

    NUM_DISTRIBUTE = NUM_GPU if NUM_GPU > 0 else 1
    # steps = int(1281167 / batch_size / NUM_DISTRIBUTE)
    # train_steps = int(1281167 / batch_size)  # 1000 classes
    # val_steps = int(50000 / batch_size)      # 1000 classes
    # train_steps = int(383690 / batch_size)   # 300 modification
    # val_steps = int(15000 / batch_size)      # 300 modification
    train_steps = int(642289 / batch_size)     # 500 modification
    val_steps = int(25000 / batch_size)        # 500 modification
    # steps = int(192439 / batch_size / NUM_DISTRIBUTE)  # 600 modification
    print(f"[INFO] Total Epochs:{epochs} Train Steps:{train_steps} "
          f"Validate Steps:{val_steps} Workers:{NUM_DISTRIBUTE} "
          f"Batch size:{batch_size}")

    his = model.fit(
        x=ds_train,
        steps_per_epoch=train_steps,
        validation_data=ds_valid,
        validation_steps=val_steps,
        callbacks=[
            get_lr_func(epochs, lr_sched, initial_lr, final_lr, NUM_GPU)
        ],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs,
        verbose=2)

    # print(his.history)
    top_k_history = his.history.get('val_top_k_categorical_accuracy', [])
    final_acc = top_k_history[-1] if top_k_history else 0.
    print(f"Final acc:{final_acc}")
    nni.report_final_result(final_acc)
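# A minimal sketch of the NNI driver side for the trial above; the search
# space keys and all argument values are illustrative, not taken from this
# repository. `nni.get_next_parameter()` fetches the tuner's suggestion for
# the current trial, and the `train` function above reports the final top-k
# accuracy back through `nni.report_final_result`.
import nni

params = {'initial_lr': 6e-2, 'dropout_rate': 0.2}  # trial defaults
params.update(nni.get_next_parameter())             # tuner-chosen values
train(model_name='resnet50', dropout_rate=params['dropout_rate'],
      optim_name='sgd', epsilon=1e-7, label_smoothing=0.1,
      use_lookahead=False, batch_size=256, iter_size=1, lr_sched='linear',
      initial_lr=params['initial_lr'], final_lr=1e-5, weight_decay=1e-4,
      epochs=90, dataset_dir='/path/to/tfrecords')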
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr,
          final_lr, weight_decay, epochs, iterations, dataset_dir,
          skip_eval, eval_checkpoint, run_on_hpu, measure_perf,
          extract_tensors_cfg_file_path, bfloat16, train_subset, val_subset):
    """Prepare data and train the model."""
    if not run_on_hpu:
        strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, train_subset, batch_size)
    if skip_eval:
        ds_valid = None
    else:
        ds_valid = get_dataset(dataset_dir, val_subset, batch_size)

    # instantiate training callbacks
    lrate = get_lr_func(epochs, lr_sched, initial_lr, final_lr)
    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='train_loss')
    tensorboard = tf.keras.callbacks.TensorBoard(
        log_dir='{}/{}'.format(config.LOG_DIR, time.time()))

    if iterations:
        steps_per_epoch = iterations
        print(f"Changing steps per epoch to {steps_per_epoch}")
    else:
        steps_per_epoch = 1281167 // batch_size

    if skip_eval:
        val_steps = 0
    else:
        val_steps = 50000 // batch_size

    # build model and do training
    get_training_model_kwargs = {
        "model_name": model_name,
        "dropout_rate": dropout_rate,
        "optimizer": optimizer,
        "label_smoothing": label_smoothing,
        "use_lookahead": use_lookahead,
        "iter_size": iter_size,
        "weight_decay": weight_decay,
        "batch_size": batch_size,
    }

    if not run_on_hpu:
        with strategy.scope():
            model = get_training_model(**get_training_model_kwargs)
    else:
        if bfloat16:
            # Bf16 conversion, full list
            os.environ['TF_ENABLE_BF16_CONVERSION'] = 'full'
        else:
            os.environ['TF_ENABLE_BF16_CONVERSION'] = 'false'
        print("train: Set TF_ENABLE_BF16_CONVERSION: " +
              os.environ.get('TF_ENABLE_BF16_CONVERSION'))
        model = get_training_model(**get_training_model_kwargs)

    if eval_checkpoint is not None:
        model.load_weights(eval_checkpoint)
        results = model.evaluate(x=ds_valid, steps=val_steps)
        print("Test loss, Test acc:", results)
        exit()

    x = ds_train
    y = None
    callbacks = [lrate, model_ckpt]
    shuffle = True

    if measure_perf:
        callbacks += [KerasMeasurePerfCallback(model, batch_size)]

    if extract_tensors_cfg_file_path is not None:
        tensors_extraction_callback = KerasTensorExtractionCallback(
            model, extract_tensors_cfg_file_path)
        callbacks += [tensors_extraction_callback]
        x = tensors_extraction_callback.get_input()
        y = tensors_extraction_callback.get_target()
        steps_per_epoch = 1
        epochs = 1
        ds_valid = None
        val_steps = 0
        shuffle = False

    model.fit(x=x, y=y,
              steps_per_epoch=steps_per_epoch,
              validation_data=ds_valid,
              validation_steps=val_steps,
              callbacks=callbacks,
              epochs=epochs,
              shuffle=shuffle)

    # training finished
    model.save('{}/{}-model-final.h5'.format(config.SAVE_DIR, save_name))
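# The `iter_size` argument threaded through these scripts is assumed to
# request gradient accumulation: summing gradients over several micro-batches
# before a single optimizer update, which emulates a larger effective batch.
# A minimal illustrative train step follows; it is a sketch, not this
# repository's implementation.
def accumulated_train_step(model, loss_fn, optimizer, micro_batches, iter_size):
    # One zero-initialized accumulator per trainable variable.
    accum = [tf.zeros_like(v) for v in model.trainable_variables]
    for x, y in micro_batches:  # iterable of `iter_size` (x, y) micro-batches
        with tf.GradientTape() as tape:
            # Scale each micro-batch loss so the summed gradient matches the
            # mean over the full effective batch.
            loss = loss_fn(y, model(x, training=True)) / iter_size
        grads = tape.gradient(loss, model.trainable_variables)
        accum = [a + g for a, g in zip(accum, grads)]
    # Single weight update for the whole accumulated batch.
    optimizer.apply_gradients(zip(accum, model.trainable_variables))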