Exemple #1
0
def train(model_name, dropout_rate, optim_name,
          use_lookahead, batch_size, iter_size,
          lr_sched, initial_lr, final_lr,
          weight_decay, epochs, dataset_dir):
    """Build the datasets, callbacks and model, then fit and save the model.

    The batch size, iteration size, learning rates and weight decay passed
    in are each routed through their ``get_*`` helper (together with
    ``model_name``); the helper's return value is what training uses.

    Checkpoints and the final model are written under ``config.SAVE_DIR``.
    TensorBoard logs go to a sub-directory of ``config.LOG_DIR`` named after
    the current timestamp.
    """
    # Resolve the effective hyper-parameters for this model.
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # Input pipelines for the two splits.
    train_data = get_dataset(dataset_dir, 'train', batch_size)
    valid_data = get_dataset(dataset_dir, 'validation', batch_size)

    # Learning-rate schedule callback.
    lr_callback = get_lr_func(epochs, lr_sched, initial_lr, final_lr)

    # When model_name is a path to an .h5 file, the save prefix is the file
    # name cut at the first '.' and then at the first '-'.
    if model_name.endswith('.h5'):
        file_name = os.path.basename(model_name)
        save_name = file_name.split('.')[0].split('-')[0]
    else:
        save_name = model_name

    ckpt_pattern = (os.path.join(config.SAVE_DIR, save_name)
                    + '-ckpt-{epoch:03d}.h5')
    ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
        ckpt_pattern, monitor='val_loss', save_best_only=True)

    log_dir = '{}/{}'.format(config.LOG_DIR, time.time())
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

    # Build the model.
    model = get_training_model(
        model_name=model_name,
        dropout_rate=dropout_rate,
        optimizer=optimizer,
        use_lookahead=use_lookahead,
        iter_size=iter_size,
        weight_decay=weight_decay)

    # Steps per pass are derived from the hard-coded sample counts
    # 1281167 (training) and 50000 (validation).
    # NOTE: use_multiprocessing=True, workers=4 was tried for fit() and did
    # not seem to help in terms of speed.
    fit_kwargs = {
        'x': train_data,
        'steps_per_epoch': 1281167 // batch_size,
        'validation_data': valid_data,
        'validation_steps': 50000 // batch_size,
        'callbacks': [lr_callback, ckpt_callback, tb_callback],
        'epochs': epochs,
    }
    model.fit(**fit_kwargs)

    # Training finished: persist the final model.
    final_path = '{}/{}-model-final.h5'.format(config.SAVE_DIR, save_name)
    model.save(final_path)
Exemple #2
0
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr, final_lr,
          weight_decay, epochs, dataset_dir, cross_device_ops, num_packs,
          tf_gpu_thread_mode):
    """Prepare data, train the model under a MirroredStrategy and time it.

    Args:
        model_name, dropout_rate, label_smoothing, use_lookahead: forwarded
            to ``get_training_model``.
        optim_name, epsilon: forwarded to ``get_optimizer``.
        batch_size, iter_size, initial_lr, final_lr, weight_decay: each is
            passed through its ``get_*`` helper together with ``model_name``
            and the helper's result is used.
        lr_sched, epochs: forwarded to ``get_lr_func``; ``epochs`` is also
            the number of epochs passed to ``fit``.
        dataset_dir: directory handed to ``get_dataset`` for both splits.
        cross_device_ops: ``"HierarchicalCopyAllReduce"`` or
            ``"NcclAllReduce"`` selects that all-reduce implementation; any
            other value uses the ``MirroredStrategy`` default.
        num_packs: ``num_packs`` argument for the selected all-reduce.
        tf_gpu_thread_mode: when one of ``"global"``, ``"gpu_private"`` or
            ``"gpu_shared"``, exported as ``TF_GPU_THREAD_MODE``; otherwise
            ignored.

    Returns:
        Tuple ``(acc, fit_time)``: the last recorded
        ``val_top_k_categorical_accuracy`` (``0.`` when none was recorded)
        and the wall-clock time in hours from function entry until ``fit``
        returns.
    """
    start = time.time()

    if tf_gpu_thread_mode in ("global", "gpu_private", "gpu_shared"):
        os.environ['TF_GPU_THREAD_MODE'] = tf_gpu_thread_mode

    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, 'train', batch_size)
    ds_valid = get_dataset(dataset_dir, 'validation', batch_size)

    # Pick the cross-device reduction; unknown names fall back to the
    # strategy's default.
    if cross_device_ops == "HierarchicalCopyAllReduce":
        mirrored_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=tf.distribute.HierarchicalCopyAllReduce(
                num_packs=num_packs))
    elif cross_device_ops == "NcclAllReduce":
        mirrored_strategy = tf.distribute.MirroredStrategy(
            cross_device_ops=tf.distribute.NcclAllReduce(num_packs=num_packs))
    else:
        mirrored_strategy = tf.distribute.MirroredStrategy()

    with mirrored_strategy.scope():
        model = get_training_model(model_name=model_name,
                                   dropout_rate=dropout_rate,
                                   optimizer=optimizer,
                                   label_smoothing=label_smoothing,
                                   use_lookahead=use_lookahead,
                                   iter_size=iter_size,
                                   weight_decay=weight_decay,
                                   gpus=NUM_GPU)

    NUM_DISTRIBUTE = NUM_GPU if NUM_GPU > 0 else 1

    # Hard-coded sample counts, tagged "500 modification" by the author
    # (presumably a 500-class subset -- TODO confirm they match dataset_dir).
    # Counts previously used here: 1281167 / 50000 ("1000 classes") and
    # 383690 / 15000 ("300 modification").
    train_steps = int(642289 / batch_size)
    val_steps = int(25000 / batch_size)
    print(
        f"[INFO] Total Epochs:{epochs} Train Steps:{train_steps} Validate Steps: {val_steps} Workers:{NUM_DISTRIBUTE} Batch size:{batch_size}"
    )

    # NOTE: use_multiprocessing=True, workers=4 was tried and did not seem
    # to help in terms of speed.
    his = model.fit(
        x=ds_train,
        steps_per_epoch=train_steps,
        validation_data=ds_valid,
        validation_steps=val_steps,
        callbacks=[
            get_lr_func(epochs, lr_sched, initial_lr, final_lr, NUM_GPU)
        ],
        epochs=epochs,
        verbose=2)

    end = time.time()
    fit_time = (end - start) / 3600.0

    # A missing metric is treated like an empty one, so a finished run still
    # returns its timing instead of dying on a KeyError.
    top_k_history = his.history.get('val_top_k_categorical_accuracy', [])
    acc = top_k_history[-1] if len(top_k_history) >= 1 else 0.
    print(f"[TRIAL END] time: {fit_time} {his.history}")
    return acc, fit_time
Exemple #3
0
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr, final_lr,
          weight_decay, epochs, dataset_dir):
    """Train the model under a MirroredStrategy and report the result to NNI.

    Hyper-parameters are first passed through their ``get_*`` helpers, the
    'train' and 'validation' datasets are loaded from ``dataset_dir``, and
    the model is built inside a ``MirroredStrategy`` scope that uses
    ``NcclAllReduce(num_packs=2)``. After ``fit`` returns, the last recorded
    ``val_top_k_categorical_accuracy`` (``0.`` if none was recorded) is
    printed and sent through ``nni.report_final_result``.
    """
    # Resolve effective hyper-parameters.
    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # Input pipelines.
    train_data = get_dataset(dataset_dir, 'train', batch_size)
    valid_data = get_dataset(dataset_dir, 'validation', batch_size)

    # Distribution strategy with NCCL all-reduce.
    all_reduce = tf.distribute.NcclAllReduce(num_packs=2)
    strategy = tf.distribute.MirroredStrategy(cross_device_ops=all_reduce)
    with strategy.scope():
        model = get_training_model(
            model_name=model_name,
            dropout_rate=dropout_rate,
            optimizer=optimizer,
            label_smoothing=label_smoothing,
            use_lookahead=use_lookahead,
            iter_size=iter_size,
            weight_decay=weight_decay,
            gpus=NUM_GPU,
        )

    # Per-epoch accuracy printer; defined here but not passed to fit().
    class PrintAcc(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            print(
                f"Epoch{epoch+1} acc#{logs.get('acc')}# val_acc#{logs.get('val_acc')} val_top_k_categorical_accuracy#{logs.get('val_top_k_categorical_accuracy')}"
            )

    if NUM_GPU > 0:
        NUM_DISTRIBUTE = NUM_GPU
    else:
        NUM_DISTRIBUTE = 1

    # Hard-coded sample counts, tagged "500 modification" by the author.
    # Counts previously used here: 1281167 / 50000 ("1000 classes") and
    # 383690 / 15000 ("300 modification").
    train_steps = int(642289 / batch_size)
    val_steps = int(25000 / batch_size)
    print(
        f"[INFO] Total Epochs:{epochs} Train Steps:{train_steps} Validate Steps: {val_steps} Workers:{NUM_DISTRIBUTE} Batch size:{batch_size}"
    )

    # NOTE: use_multiprocessing=True, workers=4 was tried and did not seem
    # to help in terms of speed.
    lr_callback = get_lr_func(epochs, lr_sched, initial_lr, final_lr, NUM_GPU)
    history = model.fit(
        x=train_data,
        steps_per_epoch=train_steps,
        validation_data=valid_data,
        validation_steps=val_steps,
        callbacks=[lr_callback],
        epochs=epochs,
        verbose=2,
    )

    # Take the last validation top-k accuracy, or 0. when nothing was logged.
    if len(history.history['val_top_k_categorical_accuracy']) < 1:
        final_acc = 0.
    else:
        final_acc = history.history['val_top_k_categorical_accuracy'][-1]
    print(f"Final acc:{final_acc}")
    nni.report_final_result(final_acc)
Exemple #4
0
def train(model_name, dropout_rate, optim_name, epsilon, label_smoothing,
          use_lookahead, batch_size, iter_size, lr_sched, initial_lr, final_lr,
          weight_decay, epochs, iterations, dataset_dir, skip_eval,
          eval_checkpoint, run_on_hpu, measure_perf,
          extract_tensors_cfg_file_path, bfloat16, train_subset, val_subset):
    """Prepare data and train the model, or only evaluate a checkpoint.

    Args:
        model_name, dropout_rate, label_smoothing, use_lookahead: forwarded
            to ``get_training_model``.
        optim_name, epsilon: forwarded to ``get_optimizer``.
        batch_size, iter_size, initial_lr, final_lr, weight_decay: each is
            passed through its ``get_*`` helper together with ``model_name``
            and the helper's result is used.
        lr_sched, epochs: forwarded to ``get_lr_func``; ``epochs`` is also
            the number of epochs passed to ``fit``.
        iterations: when truthy, overrides the number of steps per epoch.
        dataset_dir, train_subset, val_subset: arguments for ``get_dataset``.
        skip_eval: when true, no validation dataset is loaded and validation
            is disabled during training.
        eval_checkpoint: when not ``None``, weights are loaded from it, the
            model is evaluated on the validation data and the process exits
            without training.
        run_on_hpu: when false, the model is built inside a
            ``tf.distribute.MirroredStrategy`` scope; when true, it is built
            directly and ``TF_ENABLE_BF16_CONVERSION`` is exported.
        measure_perf: when true, adds ``KerasMeasurePerfCallback``.
        extract_tensors_cfg_file_path: when not ``None``, runs a single
            step/epoch fed by ``KerasTensorExtractionCallback`` with
            validation and shuffling disabled.
        bfloat16: selects ``'full'`` vs ``"false"`` for
            ``TF_ENABLE_BF16_CONVERSION`` (HPU path only).

    Raises:
        ValueError: if ``eval_checkpoint`` is given together with
            ``skip_eval`` (there would be no data to evaluate on).
        SystemExit: after evaluation when ``eval_checkpoint`` is given.
    """
    # Fail fast on a contradictory request instead of passing x=None to
    # evaluate() after the model has already been built.
    if eval_checkpoint is not None and skip_eval:
        raise ValueError(
            "eval_checkpoint needs validation data, but skip_eval is set")

    if not run_on_hpu:
        strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    batch_size = get_batch_size(model_name, batch_size)
    iter_size = get_iter_size(model_name, iter_size)
    initial_lr = get_initial_lr(model_name, initial_lr)
    final_lr = get_final_lr(model_name, final_lr)
    optimizer = get_optimizer(model_name, optim_name, initial_lr, epsilon)
    weight_decay = get_weight_decay(model_name, weight_decay)

    # get training and validation data
    ds_train = get_dataset(dataset_dir, train_subset, batch_size)
    if skip_eval:
        ds_valid = None
    else:
        ds_valid = get_dataset(dataset_dir, val_subset, batch_size)

    # instantiate training callbacks
    lrate = get_lr_func(epochs, lr_sched, initial_lr, final_lr)
    # When model_name is a path to an .h5 file, the save prefix is the file
    # name cut at the first '.' and then at the first '-'.
    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    # 'loss' is the key Keras logs for the training loss; save_best_only is
    # left at its default, so the monitored value does not gate saving.
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='loss')

    if iterations:
        steps_per_epoch = iterations
        print(f"Changing steps per epoch to {steps_per_epoch}")
    else:
        # hard-coded training-set sample count
        steps_per_epoch = 1281167 // batch_size

    # hard-coded validation-set sample count
    val_steps = 0 if skip_eval else 50000 // batch_size

    # build model
    get_training_model_kwargs = {
        "model_name": model_name,
        "dropout_rate": dropout_rate,
        "optimizer": optimizer,
        "label_smoothing": label_smoothing,
        "use_lookahead": use_lookahead,
        "iter_size": iter_size,
        "weight_decay": weight_decay,
        "batch_size": batch_size,
    }

    if not run_on_hpu:
        with strategy.scope():
            model = get_training_model(**get_training_model_kwargs)
    else:
        if bfloat16:
            # Bf16 conversion, full list
            os.environ['TF_ENABLE_BF16_CONVERSION'] = 'full'
        else:
            os.environ['TF_ENABLE_BF16_CONVERSION'] = "false"

        print("train: Set TF_ENABLE_BF16_CONVERSION: " +
              os.environ.get('TF_ENABLE_BF16_CONVERSION'))
        model = get_training_model(**get_training_model_kwargs)

    # Evaluation-only mode: score the checkpoint and stop the process.
    if eval_checkpoint is not None:
        model.load_weights(eval_checkpoint)
        results = model.evaluate(x=ds_valid, steps=val_steps)
        print("Test loss, Test acc:", results)
        # Same effect as the interactive-only exit() helper, without
        # depending on the site module having installed it.
        raise SystemExit

    x = ds_train
    y = None
    callbacks = [lrate, model_ckpt]
    shuffle = True
    if measure_perf:
        callbacks.append(KerasMeasurePerfCallback(model, batch_size))

    if extract_tensors_cfg_file_path is not None:
        # Tensor-extraction run: one step of one epoch on the callback's own
        # input/target, with validation and shuffling switched off.
        tensor_extraction_cb = KerasTensorExtractionCallback(
            model, extract_tensors_cfg_file_path)
        callbacks.append(tensor_extraction_cb)
        x = tensor_extraction_cb.get_input()
        y = tensor_extraction_cb.get_target()
        steps_per_epoch = 1
        epochs = 1
        ds_valid = None
        val_steps = 0
        shuffle = False

    model.fit(x=x,
              y=y,
              steps_per_epoch=steps_per_epoch,
              validation_data=ds_valid,
              validation_steps=val_steps,
              callbacks=callbacks,
              epochs=epochs,
              shuffle=shuffle)

    # training finished
    model.save('{}/{}-model-final.h5'.format(config.SAVE_DIR, save_name))