Example #1
def train(model_name, dropout_rate, optim_name,
          use_lookahead, batch_size, iter_size,
          lr_sched, initial_lr, final_lr,
          weight_decay, epochs, dataset_dir):
    """Prepare data and train the model."""
    batch_size   = get_batch_size(model_name, batch_size)
    iter_size    = get_iter_size(model_name, iter_size)
    initial_lr   = get_initial_lr(model_name, initial_lr)
    final_lr     = get_final_lr(model_name, final_lr)
    optimizer    = get_optimizer(model_name, optim_name, initial_lr)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train', batch_size)
    ds_valid = get_dataset(dataset_dir, 'validation', batch_size)

    # instantiate training callbacks
    lrate = get_lr_func(epochs, lr_sched, initial_lr, final_lr)
    save_name = model_name if not model_name.endswith('.h5') else \
                os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='val_loss',
        save_best_only=True)
    tensorboard = tf.keras.callbacks.TensorBoard(
        log_dir='{}/{}'.format(config.LOG_DIR, time.time()))

    # build model and do training
    model = get_training_model(
        model_name=model_name,
        dropout_rate=dropout_rate,
        optimizer=optimizer,
        use_lookahead=use_lookahead,
        iter_size=iter_size,
        weight_decay=weight_decay)
    model.fit(
        x=ds_train,
        steps_per_epoch=1281167 // batch_size,
        validation_data=ds_valid,
        validation_steps=50000 // batch_size,
        callbacks=[lrate, model_ckpt, tensorboard],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs)

    # training finished
    model.save('{}/{}-model-final.h5'.format(config.SAVE_DIR, save_name))
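
A minimal invocation sketch for the train() function above, assuming it lives in a module together with the helper functions it calls (get_batch_size, get_dataset, and so on). The flag names and defaults below are illustrative assumptions, not taken from the original repository; negative defaults are assumed to mean "let the per-model helpers pick a value".

import argparse

def main():
    # Hypothetical CLI wrapper around train(); flag names and defaults are assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_dir', required=True)
    parser.add_argument('--model_name', default='mobilenet_v2')
    parser.add_argument('--dropout_rate', type=float, default=0.2)
    parser.add_argument('--optimizer', default='sgd')
    parser.add_argument('--use_lookahead', action='store_true')
    parser.add_argument('--batch_size', type=int, default=-1)
    parser.add_argument('--iter_size', type=int, default=-1)
    parser.add_argument('--lr_sched', default='linear')
    parser.add_argument('--initial_lr', type=float, default=-1.)
    parser.add_argument('--final_lr', type=float, default=-1.)
    parser.add_argument('--weight_decay', type=float, default=-1.)
    parser.add_argument('--epochs', type=int, default=1)
    args = parser.parse_args()
    train(args.model_name, args.dropout_rate, args.optimizer,
          args.use_lookahead, args.batch_size, args.iter_size,
          args.lr_sched, args.initial_lr, args.final_lr,
          args.weight_decay, args.epochs, args.dataset_dir)

if __name__ == '__main__':
    main()
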
Example #2
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr, final_lr,
          weight_decay, epochs, dataset_dir, cross_device_ops, num_packs,
          tf_gpu_thread_mode):
    """Prepare data and train the model."""
    start = time.time()
    if tf_gpu_thread_mode in ["global", "gpu_private", "gpu_shared"]:
        os.environ['TF_GPU_THREAD_MODE'] = tf_gpu_thread_mode
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)
    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train',
                           batch_size)  # 300 modification
    ds_valid = get_dataset(dataset_dir, 'validation',
                           batch_size)  # 300 modification
    # ds_train = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_train/", 'train', batch_size) # 1000 modification
    # ds_valid = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_val/", 'validation', batch_size) # 1000 modification
    if cross_device_ops == "HierarchicalCopyAllReduce":
        mirrored_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=tf.distribute.HierarchicalCopyAllReduce(
                num_packs=num_packs))
    elif cross_device_ops == "NcclAllReduce":
        mirrored_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=tf.distribute.NcclAllReduce(num_packs=num_packs))
    else:
        mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        model = get_training_model(model_name=model_name,
                                   dropout_rate=dropout_rate,
                                   optimizer=optimizer,
                                   label_smoothing=label_smoothing,
                                   use_lookahead=use_lookahead,
                                   iter_size=iter_size,
                                   weight_decay=weight_decay,
                                   gpus=NUM_GPU)

    class PrintAcc(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print(f"Epoch{epoch+1} {logs}")

    NUM_DISTRIBUTE = NUM_GPU if NUM_GPU > 0 else 1
    # train_steps = int(1281167 / batch_size) # 1000 classes
    # val_steps = int(50000 / batch_size) # 1000 classes
    # train_steps = int(383690 / batch_size) # 300 modification
    # val_steps = int(15000 / batch_size) # 300 modification
    train_steps = int(642289 / batch_size)  # 500 modification
    val_steps = int(25000 / batch_size)  # 500 modification
    print(
        f"[INFO] Total Epochs:{epochs} Train Steps:{train_steps} Validate Steps: {val_steps} Workers:{NUM_DISTRIBUTE} Batch size:{batch_size}"
    )
    his = model.fit(
        x=ds_train,
        steps_per_epoch=train_steps,
        validation_data=ds_valid,
        validation_steps=val_steps,
        callbacks=[
            get_lr_func(epochs, lr_sched, initial_lr, final_lr, NUM_GPU)
        ],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs,
        verbose=2)

    end = time.time()
    fit_time = (end - start) / 3600.0
    val_topk_history = his.history['val_top_k_categorical_accuracy']
    acc = val_topk_history[-1] if val_topk_history else 0.
    print(f"[TRIAL END] time: {fit_time} {his.history}")
    return acc, fit_time
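
This variant returns (acc, fit_time), i.e. the last recorded val_top_k_categorical_accuracy and the wall-clock training time in hours, so it is presumably driven by an outer tuning loop. A hypothetical driver might look like the sketch below; every concrete value (model name, dataset path, candidate grid) is an illustrative assumption.

# Hypothetical tuning driver for the train() variant above; all values are illustrative.
candidates = [
    {'initial_lr': 0.05, 'weight_decay': 1e-5},
    {'initial_lr': 0.10, 'weight_decay': 4e-5},
]

best_acc, best_cfg = 0., None
for cfg in candidates:
    acc, hours = train(model_name='mobilenet_v2', dropout_rate=0.2,
                       optim_name='sgd', epsilon=1e-7, label_smoothing=0.1,
                       use_lookahead=False, batch_size=256, iter_size=1,
                       lr_sched='linear', initial_lr=cfg['initial_lr'],
                       final_lr=1e-5, weight_decay=cfg['weight_decay'],
                       epochs=10, dataset_dir='/path/to/tfrecords',
                       cross_device_ops='NcclAllReduce', num_packs=2,
                       tf_gpu_thread_mode='gpu_private')
    print(f'candidate {cfg}: top-k acc {acc:.4f} in {hours:.2f} h')
    if acc > best_acc:
        best_acc, best_cfg = acc, cfg
print(f'best config: {best_cfg} (acc {best_acc:.4f})')
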
Example #3
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr, final_lr,
          weight_decay, epochs, dataset_dir):
    """Prepare data and train the model."""
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train',
                           batch_size)  # 300 modification
    ds_valid = get_dataset(dataset_dir, 'validation',
                           batch_size)  # 300 modification
    # ds_train = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_train/", 'train', batch_size) # 1000 modification
    # ds_valid = get_dataset("/lustre/project/EricLo/cx/imagenet/imagenet_1000classes_val/", 'validation', batch_size) # 1000 modification
    mirrored_strategy = tf.distribute.MirroredStrategy(
        cross_device_ops=tf.distribute.NcclAllReduce(num_packs=2))
    # mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        model = get_training_model(model_name=model_name,
                                   dropout_rate=dropout_rate,
                                   optimizer=optimizer,
                                   label_smoothing=label_smoothing,
                                   use_lookahead=use_lookahead,
                                   iter_size=iter_size,
                                   weight_decay=weight_decay,
                                   gpus=NUM_GPU)
    # model = tf.keras.models.load_model("./saves/keras_save")

    class PrintAcc(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print(
                f"Epoch{epoch+1} acc#{logs.get('acc')}# val_acc#{logs.get('val_acc')} val_top_k_categorical_accuracy#{logs.get('val_top_k_categorical_accuracy')}"
            )

    NUM_DISTRIBUTE = NUM_GPU if NUM_GPU > 0 else 1
    # steps = int(1281167  / batch_size / NUM_DISTRIBUTE)
    # train_steps = int(1281167 / batch_size) # 1000 classes
    # val_steps = int(50000 / batch_size) # 1000 classes
    # train_steps = int(383690 / batch_size) # 300 modification
    # val_steps = int(15000 / batch_size) # 300 modification
    train_steps = int(642289 / batch_size)  # 500 modification
    val_steps = int(25000 / batch_size)  # 500 modification
    # steps = int(192439 / batch_size / NUM_DISTRIBUTE) # 600 modification
    print(
        f"[INFO] Total Epochs:{epochs} Train Steps:{train_steps} Validate Steps: {val_steps} Workers:{NUM_DISTRIBUTE} Batch size:{batch_size}"
    )
    his = model.fit(
        x=ds_train,
        steps_per_epoch=train_steps,
        validation_data=ds_valid,
        validation_steps=val_steps,
        callbacks=[
            get_lr_func(epochs, lr_sched, initial_lr, final_lr, NUM_GPU)
        ],
        # The following doesn't seem to help in terms of speed.
        # use_multiprocessing=True, workers=4,
        epochs=epochs,
        verbose=2)

    # print(his.history)
    val_topk_history = his.history['val_top_k_categorical_accuracy']
    final_acc = val_topk_history[-1] if val_topk_history else 0.
    print(f"Final acc:{final_acc}")
    nni.report_final_result(final_acc)
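
Example #3 ends with nni.report_final_result(final_acc), so the hyperparameters are presumably supplied by an NNI tuner. A minimal trial wrapper could look like the following sketch; the search-space keys and fallback defaults are assumptions, while nni.get_next_parameter() and nni.report_final_result() are standard NNI trial APIs.

import nni

if __name__ == '__main__':
    # Hypothetical NNI trial wrapper; search-space keys and defaults are assumptions.
    params = {
        'dropout_rate': 0.2,
        'initial_lr': 0.05,
        'final_lr': 1e-5,
        'weight_decay': 4e-5,
    }
    params.update(nni.get_next_parameter())  # tuner-suggested values override the defaults
    train(model_name='mobilenet_v2',
          dropout_rate=params['dropout_rate'],
          optim_name='sgd', epsilon=1e-7, label_smoothing=0.1,
          use_lookahead=False, batch_size=256, iter_size=1,
          lr_sched='linear', initial_lr=params['initial_lr'],
          final_lr=params['final_lr'], weight_decay=params['weight_decay'],
          epochs=10, dataset_dir='/path/to/tfrecords')
    # train() itself reports the final metric via nni.report_final_result().
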
Example #4
def main():
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('--dataset', '--dataset_dir', metavar='PATH',
                        default=config.DEFAULT_DATASET_DIR, help='Dataset directory.')
    parser.add_argument('--optimizer', default='sgd',
                        choices=['sgd', 'adam', 'rmsprop'], help='Optimizer.')
    parser.add_argument('-d', '--dtype', default='fp32',
                        choices=['fp32', 'bf16'], help='Data type.')
    parser.add_argument('--batch_size', type=int,
                        default=32, help='Global batch size.')
    parser.add_argument('--lr_sched', default='WarmupCosine', choices=[
                        'linear', 'exp', 'steps', 'constant', 'WarmupCosine'], help='Learning rate scheduler.')
    parser.add_argument('--initial_lr', type=float,
                        default=6e-2, help='Initial learning rate.')
    parser.add_argument('--final_lr', type=float,
                        default=1e-5, help='Final learning rate.')
    parser.add_argument('--warmup_steps', type=int,
                        default=4000, help='Warmup steps.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Total number of epochs for training.')
    parser.add_argument('--steps_per_epoch', type=int,
                        help='Number of steps for training per epoch, overrides default value.')
    parser.add_argument('--validation_steps', type=int,
                        help='Number of steps for validation, overrides default value.')
    parser.add_argument('--model', default='ViT-B_16',
                        choices=['ViT-B_16', 'ViT-L_16', 'ViT-B_32', 'ViT-L_32'], help='Model.')
    parser.add_argument('--train_subset', default='train',
                        help='Pattern to detect train subset in dataset directory.')
    parser.add_argument('--val_subset', default='validation',
                        help='Pattern to detect validation subset in dataset directory.')
    parser.add_argument('--grad_accum_steps', type=int,
                        default=8, help='Gradient accumulation steps.')
    parser.add_argument('--resume_from_checkpoint_path',
                        metavar='PATH', help='Path to checkpoint to start from.')
    parser.add_argument('--resume_from_epoch', metavar='EPOCH_INDEX',
                        type=int, default=0, help='Initial epoch index.')
    parser.add_argument('--evaluate_checkpoint_path', metavar='PATH',
                        help='Checkpoint path for evaluating the model on --val_subset')
    parser.add_argument('--weights_path', metavar='PATH',
                        help='Path to weights cache directory. ~/.keras is used if not set.')
    parser.add_argument('--deterministic', action='store_true', default=False,
                        help='Enable deterministic behavior; this also disables data augmentation. --seed must be set.')
    parser.add_argument('--seed', type=int,
                        help='Seed to be used by random functions.')
    parser.add_argument('--device', default='HPU',
                        choices=['CPU', 'HPU'], help='Device type.')
    parser.add_argument('--distributed', action='store_true',
                        default=False, help='Enable distributed training.')
    parser.add_argument('--base_tf_server_port', type=int,
                        default=7850, help='Rank 0 port used by tf.distribute.')
    parser.add_argument('--save_summary_steps', type=int, default=0,
                        help='Steps between saving summaries to TensorBoard.')
    parser.add_argument('--recipe_cache', default='/tmp/vit_recipe_cache',
                        help='Path to recipe cache directory. Set to empty to disable the recipe cache. An externally set \'TF_RECIPE_CACHE_PATH\' overrides this setting.')
    parser.add_argument(
        '--dump_config', help='Side-by-side config file. Internal, do not use.')
    args = parser.parse_args()

    if args.weights_path is not None:
        config.WEIGHTS_DIR = args.weights_path

    if args.dtype == 'bf16':
        tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

    if args.device == 'HPU':
        if args.distributed:
            os.environ['TF_HCCL_MEMORY_ALLOWANCE_MB'] = '500'
        from habana_frameworks.tensorflow import load_habana_module
        from habana_frameworks.tensorflow.ops.layer_norm import HabanaLayerNormalization
        load_habana_module()
        tf.keras.layers.LayerNormalization = HabanaLayerNormalization

        # Handle recipe caching.
        recipe_cache = args.recipe_cache
        if 'TF_RECIPE_CACHE_PATH' not in os.environ and recipe_cache:
            os.environ['TF_RECIPE_CACHE_PATH'] = recipe_cache

        # Clear previous recipe cache.
        if not args.distributed or comm_rank() == 0:
            if os.path.exists(recipe_cache) and os.path.isdir(recipe_cache):
                import shutil
                shutil.rmtree(recipe_cache)
        # Wait for rank 0 to remove cache.
        if args.distributed:
            from mpi4py import MPI
            MPI.COMM_WORLD.Barrier()

    # Handle determinism.
    config.DETERMINISTIC = args.deterministic
    config.SEED = args.seed
    if args.deterministic:
        assert args.seed is not None, "Deterministic behavior requires the seed to be set."
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        config.DATA_AUGMENTATION = False
    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    # Handle distribution strategy.
    if args.distributed:
        tf_distribute_config(args.base_tf_server_port)
        if args.device == 'HPU':
            os.environ['HBN_TF_REGISTER_DATASETOPS'] = '1'
            from habana_frameworks.tensorflow.distribute import HPUStrategy
            strategy = HPUStrategy()
        else:
            strategy = tf.distribute.MultiWorkerMirroredStrategy()
    else:
        strategy = tf.distribute.OneDeviceStrategy(f'device:{args.device}:0')

    if not args.distributed or comm_rank() == 0:
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    num_classes = 1000
    batch_size = args.batch_size
    nb_epoch = args.epochs
    dataset = args.dataset
    resume_from_checkpoint_path = args.resume_from_checkpoint_path
    resume_from_epoch = args.resume_from_epoch
    optim_name = args.optimizer
    initial_lr = args.initial_lr
    final_lr = args.final_lr
    lr_sched = args.lr_sched
    warmup_steps = args.warmup_steps
    model_name = args.model
    grad_accum_steps = args.grad_accum_steps

    ds_train = get_dataset(dataset, args.train_subset, batch_size,
                           is_training=True, distributed=args.distributed)
    ds_valid = get_dataset(dataset, args.val_subset,
                           batch_size, False, distributed=args.distributed)

    if args.dump_config is not None:
        vit.CONFIG_B['dropout'] = 0.0
        vit.CONFIG_L['dropout'] = 0.0

    # Load our model
    with strategy.scope():
        image_size = 384
        vit_builders = {
            'ViT-B_16': vit.vit_b16,
            'ViT-L_16': vit.vit_l16,
            'ViT-B_32': vit.vit_b32,
            'ViT-L_32': vit.vit_l32,
        }
        if model_name not in vit_builders:
            print("Model is not supported, please use one of: "
                  "ViT-B_16, ViT-L_16, ViT-B_32, ViT-L_32")
            exit(0)
        model = vit_builders[model_name](
            image_size=image_size,
            activation='softmax',
            pretrained=True,
            include_top=True,
            pretrained_top=False,
            classes=num_classes,
            weights="imagenet21k")

        optimizer = get_optimizer(
            optim_name, initial_lr, accumulation_steps=grad_accum_steps, epsilon=1e-2)
        model.compile(optimizer=optimizer, loss='categorical_crossentropy',
                      metrics=['accuracy'], run_eagerly=False)

        # Start training

        steps_per_epoch = 1281167 // batch_size
        if args.steps_per_epoch is not None:
            steps_per_epoch = args.steps_per_epoch
        validation_steps = 50000 // batch_size
        if args.validation_steps is not None:
            validation_steps = args.validation_steps

        total_steps = nb_epoch * steps_per_epoch
        resume_step = resume_from_epoch * steps_per_epoch

        lrate = get_lr_func(nb_epoch, lr_sched, initial_lr,
                            final_lr, warmup_steps, resume_step, total_steps)

        save_name = model_name if not model_name.endswith('.h5') else \
            os.path.split(model_name)[-1].split('.')[0].split('-')[0]
        model_ckpt = tf.keras.callbacks.ModelCheckpoint(
            os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
            monitor='train_loss')

        callbacks = [lrate, model_ckpt]
        if args.save_summary_steps > 0:
            callbacks += [TensorBoardWithHParamsV2(
                vars(args), log_dir=config.LOG_DIR, update_freq=args.save_summary_steps)]
            callbacks += [ExamplesPerSecondKerasHookV2(
                output_dir=config.LOG_DIR, every_n_steps=args.save_summary_steps, batch_size=args.batch_size)]

        if args.evaluate_checkpoint_path is not None:
            model.load_weights(args.evaluate_checkpoint_path)
            results = model.evaluate(x=ds_valid, steps=validation_steps)
            print("Test loss, Test acc:", results)
            exit()

        if resume_from_epoch is not None and resume_from_checkpoint_path is not None:
            model.load_weights(resume_from_checkpoint_path)

        with dump_callback(args.dump_config):
            model.fit(x=ds_train, y=None,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=callbacks,
                      initial_epoch=resume_from_epoch,
                      epochs=nb_epoch,
                      shuffle=not args.deterministic,
                      verbose=1 if not args.distributed else comm_rank() == 0,
                      validation_data=(ds_valid, None),
                      validation_steps=validation_steps,
                      )

        if not args.distributed or comm_rank() == 0:
            model.save(f'{config.SAVE_DIR}/{save_name}-model-final.h5')
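
Example #4 builds its schedule with get_lr_func(nb_epoch, lr_sched, initial_lr, final_lr, warmup_steps, resume_step, total_steps), whose implementation is not shown in this listing. For orientation, a minimal per-epoch warmup-cosine schedule built on the standard Keras LearningRateScheduler might look like the sketch below; this is an assumption about the general shape of such a schedule, not the repository's actual get_lr_func.

import math
import tensorflow as tf

def warmup_cosine_lr(total_epochs, steps_per_epoch, initial_lr, final_lr, warmup_steps):
    # Illustrative warmup-cosine schedule, updated once per epoch.
    total_steps = total_epochs * steps_per_epoch

    def schedule(epoch, lr):
        step = epoch * steps_per_epoch
        if step < warmup_steps:
            # Linear warmup from ~0 up to initial_lr.
            return initial_lr * (step + 1) / warmup_steps
        # Cosine decay from initial_lr down to final_lr.
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return final_lr + 0.5 * (initial_lr - final_lr) * (1.0 + math.cos(math.pi * progress))

    return tf.keras.callbacks.LearningRateScheduler(schedule, verbose=1)
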
Example #5
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr, final_lr,
          weight_decay, epochs, iterations, dataset_dir, skip_eval,
          eval_checkpoint, run_on_hpu, measure_perf,
          extract_tensors_cfg_file_path, bfloat16, train_subset, val_subset):
    """Prepare data and train the model."""
    if not run_on_hpu:
        strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, train_subset, batch_size)
    if skip_eval:
        ds_valid = None
    else:
        ds_valid = get_dataset(dataset_dir, val_subset, batch_size)

    # instantiate training callbacks
    lrate = get_lr_func(epochs, lr_sched, initial_lr, final_lr)
    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='train_loss')
    tensorboard = tf.keras.callbacks.TensorBoard(
        log_dir='{}/{}'.format(config.LOG_DIR, time.time()))

    if iterations:
        steps_per_epoch = iterations
        print(f"Changing steps per epoch to {steps_per_epoch}")
    else:
        steps_per_epoch = 1281167 // batch_size

    if skip_eval:
        val_steps = 0
    else:
        val_steps = 50000 // batch_size

    # build model and do training
    get_training_model_kwargs = {
        "model_name": model_name,
        "dropout_rate": dropout_rate,
        "optimizer": optimizer,
        "label_smoothing": label_smoothing,
        "use_lookahead": use_lookahead,
        "iter_size": iter_size,
        "weight_decay": weight_decay,
        "batch_size": batch_size
    }

    if not run_on_hpu:
        with strategy.scope():
            model = get_training_model(**get_training_model_kwargs)
    else:
        if bfloat16:
            # Bf16 conversion, full list
            os.environ['TF_ENABLE_BF16_CONVERSION'] = 'full'
        else:
            os.environ['TF_ENABLE_BF16_CONVERSION'] = "false"

        print("train: Set TF_ENABLE_BF16_CONVERSION: " +
              os.environ.get('TF_ENABLE_BF16_CONVERSION'))
        model = get_training_model(**get_training_model_kwargs)

    if eval_checkpoint is not None:
        model.load_weights(eval_checkpoint)
        results = model.evaluate(x=ds_valid, steps=val_steps)
        print("Test loss, Test acc:", results)
        exit()

    x = ds_train
    y = None
    callbacks = [lrate, model_ckpt]
    shuffle = True
    if measure_perf:
        callbacks += [KerasMeasurePerfCallback(model, batch_size)]

    if extract_tensors_cfg_file_path is not None:
        tensors_extraction_callback = KerasTensorExtractionCallback(
            model, extract_tensors_cfg_file_path)
        callbacks += [tensors_extraction_callback]
        x = tensors_extraction_callback.get_input()
        y = tensors_extraction_callback.get_target()
        steps_per_epoch = 1
        epochs = 1
        ds_valid = None
        val_steps = 0
        shuffle = False

    model.fit(x=x,
              y=y,
              steps_per_epoch=steps_per_epoch,
              validation_data=ds_valid,
              validation_steps=val_steps,
              callbacks=callbacks,
              epochs=epochs,
              shuffle=shuffle)

    # training finished
    model.save('{}/{}-model-final.h5'.format(config.SAVE_DIR, save_name))
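
Example #5 optionally attaches a KerasMeasurePerfCallback, whose implementation is not part of this listing. A hypothetical stand-in that measures per-epoch throughput with the standard Keras callback hooks could look like this sketch; it would be appended to callbacks the same way the original appends KerasMeasurePerfCallback.

import time
import tensorflow as tf

class ThroughputCallback(tf.keras.callbacks.Callback):
    """Hypothetical stand-in for KerasMeasurePerfCallback: logs examples/sec per epoch."""

    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size

    def on_epoch_begin(self, epoch, logs=None):
        self._start = time.time()
        self._batches = 0

    def on_train_batch_end(self, batch, logs=None):
        self._batches += 1

    def on_epoch_end(self, epoch, logs=None):
        elapsed = time.time() - self._start
        examples = self._batches * self.batch_size
        print(f'[perf] epoch {epoch + 1}: {examples / elapsed:.1f} examples/sec')
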