def train(model_name, dropout_rate, optim_name, use_lookahead, batch_size,
          iter_size, lr_sched, initial_lr, final_lr, weight_decay, epochs,
          dataset_dir):
    """Prepare data and train the model."""
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train', batch_size)
    ds_valid = get_dataset(dataset_dir, 'validation', batch_size)

    # instantiate training callbacks
    lrate = get_lr_func(epochs, lr_sched, initial_lr, final_lr)
    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='val_loss',
        save_best_only=True)
    tensorboard = tf.keras.callbacks.TensorBoard(
        log_dir='{}/{}'.format(config.LOG_DIR, time.time()))

    # build model and do training
    model = get_training_model(
        model_name=model_name,
        dropout_rate=dropout_rate,
        optimizer=optimizer,
        use_lookahead=use_lookahead,
        iter_size=iter_size,
        weight_decay=weight_decay)
    model.fit(
        x=ds_train,
        steps_per_epoch=1281167 // batch_size,
        validation_data=ds_valid,
        validation_steps=50000 // batch_size,
        callbacks=[lrate, model_ckpt, tensorboard],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs)

    # training finished
    model.save('{}/{}-model-final.h5'.format(config.SAVE_DIR, save_name))
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr,
          final_lr, weight_decay, epochs, dataset_dir, cross_device_ops,
          num_packs, tf_gpu_thread_mode):
    """Prepare data and train the model."""
    start = time.time()
    if tf_gpu_thread_mode in ['global', 'gpu_private', 'gpu_shared']:
        os.environ['TF_GPU_THREAD_MODE'] = tf_gpu_thread_mode
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train', batch_size)       # 300 modification
    ds_valid = get_dataset(dataset_dir, 'validation', batch_size)  # 300 modification
    # ds_train = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_train/", 'train', batch_size)      # 1000 modification
    # ds_valid = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_val/", 'validation', batch_size)   # 1000 modification

    # pick the cross-device reduction used for multi-GPU synchronization
    if cross_device_ops == 'HierarchicalCopyAllReduce':
        mirrored_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=tf.distribute.HierarchicalCopyAllReduce(
                num_packs=num_packs))
    elif cross_device_ops == 'NcclAllReduce':
        mirrored_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=tf.distribute.NcclAllReduce(num_packs=num_packs))
    else:
        mirrored_strategy = tf.distribute.MirroredStrategy()

    with mirrored_strategy.scope():
        model = get_training_model(model_name=model_name,
                                   dropout_rate=dropout_rate,
                                   optimizer=optimizer,
                                   label_smoothing=label_smoothing,
                                   use_lookahead=use_lookahead,
                                   iter_size=iter_size,
                                   weight_decay=weight_decay,
                                   gpus=NUM_GPU)

    class PrintAcc(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print(f"Epoch{epoch+1} {logs}")

    NUM_DISTRIBUTE = NUM_GPU if NUM_GPU > 0 else 1
    # train_steps = int(1281167 / batch_size)  # 1000 classes
    # val_steps = int(50000 / batch_size)      # 1000 classes
    # train_steps = int(383690 / batch_size)   # 300 modification
    # val_steps = int(15000 / batch_size)      # 300 modification
    train_steps = int(642289 / batch_size)     # 500 modification
    val_steps = int(25000 / batch_size)        # 500 modification
    print(f"[INFO] Total Epochs:{epochs} Train Steps:{train_steps} Validate Steps: {val_steps} Workers:{NUM_DISTRIBUTE} Batch size:{batch_size}")

    his = model.fit(
        x=ds_train,
        steps_per_epoch=train_steps,
        validation_data=ds_valid,
        validation_steps=val_steps,
        callbacks=[get_lr_func(epochs, lr_sched, initial_lr, final_lr, NUM_GPU)],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs,
        verbose=2)

    end = time.time()
    fit_time = (end - start) / 3600.0
    acc = 0. if len(his.history['val_top_k_categorical_accuracy']) < 1 \
        else his.history['val_top_k_categorical_accuracy'][-1]
    print(f"[TRIAL END] time: {fit_time} {his.history}")
    return acc, fit_time
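# The variant above returns (acc, fit_time), which makes it easy to drive from a
# hyper-parameter search loop. A minimal sketch of such a driver follows; the grid,
# the model name, and the dataset path are illustrative assumptions, not values
# taken from this repository.
def sweep_initial_lr_sketch():
    # hypothetical grid over the initial learning rate; every other argument is a placeholder
    for lr in (1e-1, 3e-2, 1e-2):
        acc, fit_time = train(
            model_name='mobilenet_v2', dropout_rate=0.2, optim_name='sgd',
            epsilon=1e-7, label_smoothing=0.1, use_lookahead=False,
            batch_size=64, iter_size=1, lr_sched='exp', initial_lr=lr,
            final_lr=1e-5, weight_decay=1e-5, epochs=1,
            dataset_dir='/path/to/tfrecords',  # placeholder path
            cross_device_ops='NcclAllReduce', num_packs=2,
            tf_gpu_thread_mode='gpu_private')
        print(f"lr={lr} -> val top-k acc {acc:.4f} in {fit_time:.2f} h")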
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr,
          final_lr, weight_decay, epochs, dataset_dir):
    """Prepare data and train the model."""
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train', batch_size)       # 300 modification
    ds_valid = get_dataset(dataset_dir, 'validation', batch_size)  # 300 modification
    # ds_train = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_train/", 'train', batch_size)      # 1000 modification
    # ds_valid = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_val/", 'validation', batch_size)   # 1000 modification

    mirrored_strategy = tf.distribute.MirroredStrategy(
        cross_device_ops=tf.distribute.NcclAllReduce(num_packs=2))
    # mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        model = get_training_model(model_name=model_name,
                                   dropout_rate=dropout_rate,
                                   optimizer=optimizer,
                                   label_smoothing=label_smoothing,
                                   use_lookahead=use_lookahead,
                                   iter_size=iter_size,
                                   weight_decay=weight_decay,
                                   gpus=NUM_GPU)
        # model = tf.keras.models.load_model("./saves/keras_save")

    class PrintAcc(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print(f"Epoch{epoch+1} acc#{logs.get('acc')}# val_acc#{logs.get('val_acc')} val_top_k_categorical_accuracy#{logs.get('val_top_k_categorical_accuracy')}")

    NUM_DISTRIBUTE = NUM_GPU if NUM_GPU > 0 else 1
    # steps = int(1281167 / batch_size / NUM_DISTRIBUTE)
    # train_steps = int(1281167 / batch_size)  # 1000 classes
    # val_steps = int(50000 / batch_size)      # 1000 classes
    # train_steps = int(383690 / batch_size)   # 300 modification
    # val_steps = int(15000 / batch_size)      # 300 modification
    train_steps = int(642289 / batch_size)     # 500 modification
    val_steps = int(25000 / batch_size)        # 500 modification
    # steps = int(192439 / batch_size / NUM_DISTRIBUTE)  # 600 modification
    print(f"[INFO] Total Epochs:{epochs} Train Steps:{train_steps} Validate Steps: {val_steps} Workers:{NUM_DISTRIBUTE} Batch size:{batch_size}")

    his = model.fit(
        x=ds_train,
        steps_per_epoch=train_steps,
        validation_data=ds_valid,
        validation_steps=val_steps,
        callbacks=[get_lr_func(epochs, lr_sched, initial_lr, final_lr, NUM_GPU)],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs,
        verbose=2)

    # print(his.history)
    final_acc = 0. if len(his.history['val_top_k_categorical_accuracy']) < 1 \
        else his.history['val_top_k_categorical_accuracy'][-1]
    print(f"Final acc:{final_acc}")
    nni.report_final_result(final_acc)
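# The variant above ends with nni.report_final_result(), i.e. it is intended to run
# as an NNI trial. A minimal sketch of the trial entry point follows (assuming
# `import nni` at module top); the default hyper-parameter dict and the search-space
# keys are assumptions for illustration, not the tuned values used in this project.
if __name__ == '__main__':
    params = {
        'model_name': 'mobilenet_v2', 'dropout_rate': 0.2, 'optim_name': 'sgd',
        'epsilon': 1e-7, 'label_smoothing': 0.1, 'use_lookahead': False,
        'batch_size': 64, 'iter_size': 1, 'lr_sched': 'exp',
        'initial_lr': 1e-1, 'final_lr': 1e-5, 'weight_decay': 1e-5,
        'epochs': 1, 'dataset_dir': '/path/to/tfrecords',  # placeholder path
    }
    # merge the tuner's suggestion for this trial into the defaults
    params.update(nni.get_next_parameter())
    train(**params)  # train() reports the final metric to NNI itself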
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr,
          final_lr, weight_decay, epochs, iterations, dataset_dir, skip_eval,
          eval_checkpoint, run_on_hpu, measure_perf,
          extract_tensors_cfg_file_path, bfloat16, train_subset, val_subset):
    """Prepare data and train the model."""
    if not run_on_hpu:
        strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, train_subset, batch_size)
    if skip_eval:
        ds_valid = None
    else:
        ds_valid = get_dataset(dataset_dir, val_subset, batch_size)

    # instantiate training callbacks
    lrate = get_lr_func(epochs, lr_sched, initial_lr, final_lr)
    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='train_loss')
    tensorboard = tf.keras.callbacks.TensorBoard(
        log_dir='{}/{}'.format(config.LOG_DIR, time.time()))

    if iterations:
        steps_per_epoch = iterations
        print(f"Changing steps per epoch to {steps_per_epoch}")
    else:
        steps_per_epoch = 1281167 // batch_size

    if skip_eval:
        val_steps = 0
    else:
        val_steps = 50000 // batch_size

    # build model and do training
    get_training_model_kwargs = {
        "model_name": model_name,
        "dropout_rate": dropout_rate,
        "optimizer": optimizer,
        "label_smoothing": label_smoothing,
        "use_lookahead": use_lookahead,
        "iter_size": iter_size,
        "weight_decay": weight_decay,
        "batch_size": batch_size
    }

    if not run_on_hpu:
        with strategy.scope():
            model = get_training_model(**get_training_model_kwargs)
    else:
        if bfloat16:
            # Bf16 conversion, full list
            os.environ['TF_ENABLE_BF16_CONVERSION'] = 'full'
        else:
            os.environ['TF_ENABLE_BF16_CONVERSION'] = 'false'
        print("train: Set TF_ENABLE_BF16_CONVERSION: " +
              os.environ.get('TF_ENABLE_BF16_CONVERSION'))
        model = get_training_model(**get_training_model_kwargs)

    if eval_checkpoint is not None:
        # evaluation-only mode: restore weights, report metrics, and stop
        model.load_weights(eval_checkpoint)
        results = model.evaluate(x=ds_valid, steps=val_steps)
        print("Test loss, Test acc:", results)
        exit()

    x = ds_train
    y = None
    callbacks = [lrate, model_ckpt]
    shuffle = True
    if measure_perf:
        callbacks += [KerasMeasurePerfCallback(model, batch_size)]

    if extract_tensors_cfg_file_path is not None:
        tensors_extraction_callback = KerasTensorExtractionCallback(
            model, extract_tensors_cfg_file_path)
        callbacks += [tensors_extraction_callback]
        x = tensors_extraction_callback.get_input()
        y = tensors_extraction_callback.get_target()
        steps_per_epoch = 1
        epochs = 1
        ds_valid = None
        val_steps = 0
        shuffle = False

    model.fit(x=x,
              y=y,
              steps_per_epoch=steps_per_epoch,
              validation_data=ds_valid,
              validation_steps=val_steps,
              callbacks=callbacks,
              epochs=epochs,
              shuffle=shuffle)

    # training finished
    model.save('{}/{}-model-final.h5'.format(config.SAVE_DIR, save_name))
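# The variant above exposes many switches (HPU vs. GPU, bfloat16, perf measurement,
# tensor extraction), so it is normally driven from a command-line entry point. A
# minimal argparse sketch follows; the flag names and defaults are illustrative
# assumptions and cover only a subset of the parameters.
def main_sketch():
    import argparse
    parser = argparse.ArgumentParser(description='train a Keras ImageNet model')
    parser.add_argument('--model_name', default='mobilenet_v2')
    parser.add_argument('--dataset_dir', default='/path/to/tfrecords')  # placeholder
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--run_on_hpu', action='store_true')
    parser.add_argument('--bfloat16', action='store_true')
    args = parser.parse_args()
    # remaining arguments are fixed placeholders for this sketch
    train(model_name=args.model_name, dropout_rate=0.2, optim_name='sgd',
          epsilon=1e-7, label_smoothing=0.1, use_lookahead=False,
          batch_size=args.batch_size, iter_size=1, lr_sched='exp',
          initial_lr=1e-1, final_lr=1e-5, weight_decay=1e-5,
          epochs=args.epochs, iterations=None, dataset_dir=args.dataset_dir,
          skip_eval=False, eval_checkpoint=None, run_on_hpu=args.run_on_hpu,
          measure_perf=False, extract_tensors_cfg_file_path=None,
          bfloat16=args.bfloat16, train_subset='train', val_subset='validation')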